xref: /llvm-project/bolt/lib/Passes/BinaryPasses.cpp (revision 344228ebf45f9bd1f7626fdcd3c0fada0f0c8385)
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "llvm/Support/CommandLine.h"
19 #include <atomic>
20 #include <mutex>
21 #include <numeric>
22 #include <vector>
23 
24 #define DEBUG_TYPE "bolt-opts"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
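// Build the command-line value name for a dyno stat category by taking its
// description and replacing spaces with dashes; e.g. a description such as
// "taken branches" (illustrative) becomes the option value "taken-branches".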
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32 
33   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34 
35   OptNames[C] = bolt::DynoStats::Description(C);
36 
37   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
38 
39   return OptNames[C].c_str();
40 }
41 
42 namespace opts {
43 
44 extern cl::OptionCategory BoltCategory;
45 extern cl::OptionCategory BoltOptCategory;
46 
47 extern cl::opt<unsigned> Verbosity;
48 extern cl::opt<bool> EnableBAT;
49 extern cl::opt<unsigned> ExecutionCountThreshold;
50 extern cl::opt<bool> UpdateDebugSections;
51 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
52 
53 enum DynoStatsSortOrder : char {
54   Ascending,
55   Descending
56 };
57 
58 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
59     "print-sorted-by-order",
60     cl::desc("use ascending or descending order when printing functions "
61              "ordered by dyno stats"),
62     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
63 
64 cl::list<std::string>
65 HotTextMoveSections("hot-text-move-sections",
66   cl::desc("list of sections containing functions used for hugifying hot text. "
67            "BOLT makes sure these functions are not placed on the same page as "
68            "the hot text. (default=\'.stub,.mover\')."),
69   cl::value_desc("sec1,sec2,sec3,..."),
70   cl::CommaSeparated,
71   cl::ZeroOrMore,
72   cl::cat(BoltCategory));
73 
74 bool isHotTextMover(const BinaryFunction &Function) {
75   for (std::string &SectionName : opts::HotTextMoveSections) {
76     if (Function.getOriginSectionName() &&
77         *Function.getOriginSectionName() == SectionName)
78       return true;
79   }
80 
81   return false;
82 }
83 
84 static cl::opt<bool> MinBranchClusters(
85     "min-branch-clusters",
86     cl::desc("use a modified clustering algorithm geared towards minimizing "
87              "branches"),
88     cl::Hidden, cl::cat(BoltOptCategory));
89 
90 static cl::list<Peepholes::PeepholeOpts> Peepholes(
91     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
92     cl::value_desc("opt1,opt2,opt3,..."),
93     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
94                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
95                           "remove double jumps when able"),
96                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
97                           "insert tail call traps"),
98                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
99                           "remove useless conditional branches"),
100                clEnumValN(Peepholes::PEEP_ALL, "all",
101                           "enable all peephole optimizations")),
102     cl::ZeroOrMore, cl::cat(BoltOptCategory));
103 
104 static cl::opt<unsigned>
105     PrintFuncStat("print-function-statistics",
106                   cl::desc("print statistics about basic block ordering"),
107                   cl::init(0), cl::cat(BoltOptCategory));
108 
109 static cl::opt<bool> PrintLargeFunctions(
110     "print-large-functions",
111     cl::desc("print functions that could not be overwritten due to excessive "
112              "size"),
113     cl::init(false), cl::cat(BoltOptCategory));
114 
115 static cl::list<bolt::DynoStats::Category>
116     PrintSortedBy("print-sorted-by", cl::CommaSeparated,
117                   cl::desc("print functions sorted by order of dyno stats"),
118                   cl::value_desc("key1,key2,key3,..."),
119                   cl::values(
120 #define D(name, description, ...)                                              \
121   clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
122              description),
123                       REAL_DYNO_STATS
124 #undef D
125                           clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
126                                      "sorted by all names")),
127                   cl::ZeroOrMore, cl::cat(BoltOptCategory));
128 
129 static cl::opt<bool>
130     PrintUnknown("print-unknown",
131                  cl::desc("print names of functions with unknown control flow"),
132                  cl::cat(BoltCategory), cl::Hidden);
133 
134 static cl::opt<bool>
135     PrintUnknownCFG("print-unknown-cfg",
136                     cl::desc("dump CFG of functions with unknown control flow"),
137                     cl::cat(BoltCategory), cl::ReallyHidden);
138 
139 // Appease MSVC19 with a forward declaration: otherwise it reports an error
140 // about an undeclared variable inside a callback.
141 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
142 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
143     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
144     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
145     cl::values(
146         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
147                    "do not reorder basic blocks"),
148         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
149                    "layout blocks in reverse order"),
150         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
151                    "perform optimal layout based on profile"),
152         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
153                    "branch-predictor",
154                    "perform optimal layout prioritizing branch "
155                    "predictions"),
156         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
157                    "perform optimal layout prioritizing I-cache "
158                    "behavior"),
159         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
160                    "perform layout optimizing I-cache behavior"),
161         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
162                    "perform layout optimizing I-cache behavior"),
163         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
164                    "cluster-shuffle", "perform random layout of clusters")),
165     cl::ZeroOrMore, cl::cat(BoltOptCategory),
166     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
167       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
168         errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
169                << " use '-reorder-blocks=ext-tsp' instead\n";
170         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
171       }
172     }));
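// For example, a user passing '-reorder-blocks=cache+' gets the deprecation
// warning above and the option is transparently remapped to the ext-tsp
// layout, so both spellings select the same algorithm.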
173 
174 static cl::opt<unsigned> ReportBadLayout(
175     "report-bad-layout",
176     cl::desc("print top <uint> functions with suboptimal code layout on input"),
177     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
178 
179 static cl::opt<bool>
180     ReportStaleFuncs("report-stale",
181                      cl::desc("print the list of functions with stale profile"),
182                      cl::Hidden, cl::cat(BoltOptCategory));
183 
184 enum SctcModes : char {
185   SctcAlways,
186   SctcPreserveDirection,
187   SctcHeuristic
188 };
189 
190 static cl::opt<SctcModes>
191 SctcMode("sctc-mode",
192   cl::desc("mode for simplify conditional tail calls"),
193   cl::init(SctcAlways),
194   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
195     clEnumValN(SctcPreserveDirection,
196       "preserve",
197       "only perform sctc when branch direction is "
198       "preserved"),
199     clEnumValN(SctcHeuristic,
200       "heuristic",
201       "use branch prediction data to control sctc")),
202   cl::ZeroOrMore,
203   cl::cat(BoltOptCategory));
204 
205 static cl::opt<unsigned>
206 StaleThreshold("stale-threshold",
207     cl::desc(
208       "maximum percentage of stale functions to tolerate (default: 100)"),
209     cl::init(100),
210     cl::Hidden,
211     cl::cat(BoltOptCategory));
212 
213 static cl::opt<unsigned> TSPThreshold(
214     "tsp-threshold",
215     cl::desc(
216         "maximum number of hot basic blocks in a function for which to use "
217         "a precise TSP solution while re-ordering basic blocks"),
218     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
219 
220 static cl::opt<unsigned> TopCalledLimit(
221     "top-called-limit",
222     cl::desc("maximum number of functions to print in top called "
223              "functions section"),
224     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
225 
226 } // namespace opts
227 
228 namespace llvm {
229 namespace bolt {
230 
231 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
232   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
233          !BF.isIgnored();
234 }
235 
236 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
237   return BF.isSimple() && !BF.isIgnored();
238 }
239 
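// Normalize the CFG of a single function: drop empty blocks that are neither
// entry points nor landing pads, rewiring their predecessors to the single
// successor (unless a predecessor uses a jump table), and count duplicate
// conditional edges so fixBranches() can merge them and repair the branch
// instructions afterwards.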
240 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
241   uint64_t NumRemoved = 0;
242   uint64_t NumDuplicateEdges = 0;
243   uint64_t NeedsFixBranches = 0;
244   for (BinaryBasicBlock &BB : BF) {
245     if (!BB.empty())
246       continue;
247 
248     if (BB.isEntryPoint() || BB.isLandingPad())
249       continue;
250 
251     // Handle a dangling empty block.
252     if (BB.succ_size() == 0) {
253       // If an empty dangling basic block has a predecessor, it could be a
254       // result of codegen for __builtin_unreachable. In such a case, do
255       // not remove the block.
256       if (BB.pred_size() == 0) {
257         BB.markValid(false);
258         ++NumRemoved;
259       }
260       continue;
261     }
262 
263     // The block should have just one successor.
264     BinaryBasicBlock *Successor = BB.getSuccessor();
265     assert(Successor && "invalid CFG encountered");
266 
267     // Redirect all predecessors to the successor block.
268     while (!BB.pred_empty()) {
269       BinaryBasicBlock *Predecessor = *BB.pred_begin();
270       if (Predecessor->hasJumpTable())
271         break;
272 
273       if (Predecessor == Successor)
274         break;
275 
276       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
277       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
278                                     BI.MispredictedCount);
279       // We need to fix branches even if we failed to replace all successors
280       // and remove the block.
281       NeedsFixBranches = true;
282     }
283 
284     if (BB.pred_empty()) {
285       BB.removeAllSuccessors();
286       BB.markValid(false);
287       ++NumRemoved;
288     }
289   }
290 
291   if (NumRemoved)
292     BF.eraseInvalidBBs();
293 
294   // Check for duplicate successors. Do it after the empty block elimination as
295   // we can get more duplicate successors.
296   for (BinaryBasicBlock &BB : BF)
297     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
298         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
299       ++NumDuplicateEdges;
300 
301   // fixBranches() will get rid of duplicate edges and update jump instructions.
302   if (NumDuplicateEdges || NeedsFixBranches)
303     BF.fixBranches();
304 
305   NumDuplicateEdgesMerged += NumDuplicateEdges;
306   NumBlocksRemoved += NumRemoved;
307 }
308 
309 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
310   ParallelUtilities::runOnEachFunction(
311       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
312       [&](BinaryFunction &BF) { runOnFunction(BF); },
313       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
314       "NormalizeCFG");
315   if (NumBlocksRemoved)
316     BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
317               << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
318   if (NumDuplicateEdgesMerged)
319     BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
320               << " duplicate CFG edge"
321               << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
322   return Error::success();
323 }
324 
325 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
326   BinaryContext &BC = Function.getBinaryContext();
327   unsigned Count;
328   uint64_t Bytes;
329   Function.markUnreachableBlocks();
330   LLVM_DEBUG({
331     for (BinaryBasicBlock &BB : Function) {
332       if (!BB.isValid()) {
333         dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
334                << " in function " << Function << "\n";
335         Function.dump();
336       }
337     }
338   });
339   BinaryContext::IndependentCodeEmitter Emitter =
340       BC.createIndependentMCCodeEmitter();
341   std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
342   DeletedBlocks += Count;
343   DeletedBytes += Bytes;
344   if (Count) {
345     auto L = BC.scopeLock();
346     Modified.insert(&Function);
347     if (opts::Verbosity > 0)
348       BC.outs() << "BOLT-INFO: removed " << Count
349                 << " dead basic block(s) accounting for " << Bytes
350                 << " bytes in function " << Function << '\n';
351   }
352 }
353 
354 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
355   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
356     runOnFunction(BF);
357   };
358 
359   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
360     return !shouldOptimize(BF) || BF.getLayout().block_empty();
361   };
362 
363   ParallelUtilities::runOnEachFunction(
364       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
365       SkipPredicate, "eliminate-unreachable");
366 
367   if (DeletedBlocks)
368     BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
369               << DeletedBytes << " bytes of code\n";
370   return Error::success();
371 }
372 
373 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
374   return (BinaryFunctionPass::shouldPrint(BF) &&
375           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
376 }
377 
378 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
379   // Apply execution count threshold
380   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
381     return false;
382 
383   return BinaryFunctionPass::shouldOptimize(BF);
384 }
385 
386 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
387   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
388     return Error::success();
389 
390   std::atomic_uint64_t ModifiedFuncCount(0);
391   std::mutex FunctionEditDistanceMutex;
392   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
393 
394   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
395     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
396     if (opts::PrintFuncStat > 0)
397       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
398 
399     const bool LayoutChanged =
400         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
401     if (LayoutChanged) {
402       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
403       if (opts::PrintFuncStat > 0) {
404         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
405         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
406         FunctionEditDistance[&BF] = Distance;
407       }
408     }
409   };
410 
411   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
412     return !shouldOptimize(BF);
413   };
414 
415   ParallelUtilities::runOnEachFunction(
416       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
417       "ReorderBasicBlocks");
418   const size_t NumAllProfiledFunctions =
419       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
420 
421   BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
422             << format(
423                    "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
424                    ModifiedFuncCount.load(std::memory_order_relaxed),
425                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
426                        NumAllProfiledFunctions,
427                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
428                        BC.getBinaryFunctions().size());
429 
430   if (opts::PrintFuncStat > 0) {
431     raw_ostream &OS = BC.outs();
432     // Collect functions in a map keyed by score to visit them in sorted order.
433     std::map<uint64_t, BinaryFunction &> ScoreMap;
434     auto &BFs = BC.getBinaryFunctions();
435     for (auto It = BFs.begin(); It != BFs.end(); ++It)
436       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
437           It->second.getFunctionScore(), It->second));
438 
439     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
440     OS << "           There are " << BFs.size() << " functions in total.\n";
441     OS << "           Number of functions being modified: "
442        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
443     OS << "           Printing detailed information for the top "
444        << opts::PrintFuncStat << " functions, ranked by function score."
445        << "\n\n";
446     uint64_t I = 0;
447     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
448              ScoreMap.rbegin();
449          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
450       BinaryFunction &Function = Rit->second;
451 
452       OS << "           Information for top function " << (I + 1) << ":\n";
453       OS << "             Function score: " << Function.getFunctionScore()
454          << "\n";
455       OS << "             Number of basic blocks in this function: "
456          << Function.size() << "\n";
457       OS << "             Number of instructions in this function: "
458          << Function.getInstructionCount() << "\n";
459       OS << "             The edit distance for this function is: "
460          << FunctionEditDistance.lookup(&Function) << "\n\n";
461     }
462   }
463   return Error::success();
464 }
465 
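// Pick a reordering algorithm for the requested layout type: reverse layout
// needs no profile; small functions (block count within '-tsp-threshold')
// with any non-shuffle layout use the exact TSP solver; otherwise ext-tsp
// runs its own algorithm, and the remaining layouts combine a greedy
// clustering step (the '-min-branch-clusters' variant when requested) with
// the selected cluster-ordering heuristic.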
466 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
467                                               LayoutType Type,
468                                               bool MinBranchClusters) const {
469   if (BF.size() == 0 || Type == LT_NONE)
470     return false;
471 
472   BinaryFunction::BasicBlockOrderType NewLayout;
473   std::unique_ptr<ReorderAlgorithm> Algo;
474 
475   // Cannot do optimal layout without profile.
476   if (Type != LT_REVERSE && !BF.hasValidProfile())
477     return false;
478 
479   if (Type == LT_REVERSE) {
480     Algo.reset(new ReverseReorderAlgorithm());
481   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
482     // Work on optimal solution if problem is small enough
483     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
484     Algo.reset(new TSPReorderAlgorithm());
485   } else {
486     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
487 
488     std::unique_ptr<ClusterAlgorithm> CAlgo;
489     if (MinBranchClusters)
490       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
491     else
492       CAlgo.reset(new PHGreedyClusterAlgorithm());
493 
494     switch (Type) {
495     case LT_OPTIMIZE:
496       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
497       break;
498 
499     case LT_OPTIMIZE_BRANCH:
500       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
501       break;
502 
503     case LT_OPTIMIZE_CACHE:
504       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
505       break;
506 
507     case LT_OPTIMIZE_EXT_TSP:
508       Algo.reset(new ExtTSPReorderAlgorithm());
509       break;
510 
511     case LT_OPTIMIZE_SHUFFLE:
512       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
513       break;
514 
515     default:
516       llvm_unreachable("unexpected layout type");
517     }
518   }
519 
520   Algo->reorderBasicBlocks(BF, NewLayout);
521 
522   return BF.getLayout().update(NewLayout);
523 }
524 
525 Error FixupBranches::runOnFunctions(BinaryContext &BC) {
526   for (auto &It : BC.getBinaryFunctions()) {
527     BinaryFunction &Function = It.second;
528     if (!BC.shouldEmit(Function) || !Function.isSimple())
529       continue;
530 
531     Function.fixBranches();
532   }
533   return Error::success();
534 }
535 
536 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
537   std::atomic<bool> HasFatal{false};
538   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
539     if (!BF.finalizeCFIState()) {
540       if (BC.HasRelocations) {
541         BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
542                   << ". Exiting.\n";
543         HasFatal = true;
544         return;
545       }
546       BF.setSimple(false);
547       return;
548     }
549 
550     BF.setFinalized();
551 
552     // Update exception handling information.
553     BF.updateEHRanges();
554   };
555 
556   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
557     return !BC.shouldEmit(BF);
558   };
559 
560   ParallelUtilities::runOnEachFunction(
561       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
562       SkipPredicate, "FinalizeFunctions");
563   if (HasFatal)
564     return createFatalBOLTError("finalize CFI state failure");
565   return Error::success();
566 }
567 
568 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
569   if (BC.HasRelocations)
570     return Error::success();
571 
572   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
573   // incorrect metadata.
574   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
575     uint64_t HotSize, ColdSize;
576     std::tie(HotSize, ColdSize) =
577         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
578     if (HotSize > BF.getMaxSize()) {
579       if (opts::PrintLargeFunctions)
580         BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
581                   << (HotSize - BF.getMaxSize()) << " bytes\n";
582       BF.setSimple(false);
583     }
584   };
585 
586   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
587     return !shouldOptimize(BF);
588   };
589 
590   ParallelUtilities::runOnEachFunction(
591       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
592       SkipFunc, "CheckLargeFunctions");
593 
594   return Error::success();
595 }
596 
597 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
598   // Unlike other passes, allow functions in non-CFG state.
599   return BF.isSimple() && !BF.isIgnored();
600 }
601 
602 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
603   // Convert GnuArgsSize annotations into CFIs.
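  // Illustrative example (assumed values): if two invokes in the same
  // fragment carry GnuArgsSize annotations of 16 and then 0, a GNU_args_size
  // CFI is emitted at each point where the value changes; the running value
  // is reset to 0 at the start of every fragment.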
604   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
605     for (FunctionFragment &FF : BF->getLayout().fragments()) {
606       // Reset at the start of the new fragment.
607       int64_t CurrentGnuArgsSize = 0;
608 
609       for (BinaryBasicBlock *const BB : FF) {
610         for (auto II = BB->begin(); II != BB->end(); ++II) {
611           if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
612             continue;
613 
614           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
615           assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
616           if (NewGnuArgsSize == CurrentGnuArgsSize)
617             continue;
618 
619           auto InsertII = BF->addCFIInstruction(
620               BB, II,
621               MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
622           CurrentGnuArgsSize = NewGnuArgsSize;
623           II = std::next(InsertII);
624         }
625       }
626     }
627   }
628   return Error::success();
629 }
630 
631 // Check for dirty state in MCSymbol objects that might be a consequence
632 // of running calculateEmittedSize() in parallel during the split-functions
633 // pass. If an inconsistent state is found (symbol already registered or
634 // already defined), clean it.
635 Error CleanMCState::runOnFunctions(BinaryContext &BC) {
636   MCContext &Ctx = *BC.Ctx;
637   for (const auto &SymMapEntry : Ctx.getSymbols()) {
638     const MCSymbol *S = SymMapEntry.getValue().Symbol;
639     if (!S)
640       continue;
641     if (S->isDefined()) {
642       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
643                         << "\" is already defined\n");
644       const_cast<MCSymbol *>(S)->setUndefined();
645     }
646     if (S->isRegistered()) {
647       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
648                         << "\" is already registered\n");
649       const_cast<MCSymbol *>(S)->setIsRegistered(false);
650     }
651     LLVM_DEBUG(if (S->isVariable()) {
652       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
653     });
654   }
655   return Error::success();
656 }
657 
658 // This peephole fixes jump instructions that jump to another basic
659 // block with a single jump instruction, e.g.
660 //
661 // B0: ...
662 //     jmp  B1   (or jcc B1)
663 //
664 // B1: jmp  B2
665 //
666 // ->
667 //
668 // B0: ...
669 //     jmp  B2   (or jcc B2)
670 //
671 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
672   uint64_t NumDoubleJumps = 0;
673 
674   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
675   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
676   for (BinaryBasicBlock &BB : Function) {
677     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
678                              const MCSymbol *SuccSym,
679                              std::optional<uint32_t> Offset) {
680       // Ignore infinite loop jumps or fallthrough tail jumps.
681       if (Pred == Succ || Succ == &BB)
682         return false;
683 
684       if (Succ) {
685         const MCSymbol *TBB = nullptr;
686         const MCSymbol *FBB = nullptr;
687         MCInst *CondBranch = nullptr;
688         MCInst *UncondBranch = nullptr;
689         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
690         if (!Res) {
691           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
692                      Pred->dump());
693           return false;
694         }
695         Pred->replaceSuccessor(&BB, Succ);
696 
697         // We must patch up any existing branch instructions to match up
698         // with the new successor.
699         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
700                "Predecessor block has inconsistent number of successors");
701         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
702           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
703         } else if (UncondBranch &&
704                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
705           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
706         } else if (!UncondBranch) {
707           assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
708                  "Don't add an explicit jump to a fallthrough block.");
709           Pred->addBranchInstruction(Succ);
710         }
711       } else {
712         // Succ will be null in the tail call case.  In this case we
713         // need to explicitly add a tail call instruction.
714         MCInst *Branch = Pred->getLastNonPseudoInstr();
715         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
716           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
717           Pred->removeSuccessor(&BB);
718           Pred->eraseInstruction(Pred->findInstruction(Branch));
719           Pred->addTailCallInstruction(SuccSym);
720           if (Offset) {
721             MCInst *TailCall = Pred->getLastNonPseudoInstr();
722             assert(TailCall);
723             MIB->setOffset(*TailCall, *Offset);
724           }
725         } else {
726           return false;
727         }
728       }
729 
730       ++NumDoubleJumps;
731       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
732                         << Pred->getName() << " -> " << BB.getName() << " to "
733                         << Pred->getName() << " -> " << SuccSym->getName()
734                         << (!Succ ? " (tail)\n" : "\n"));
735 
736       return true;
737     };
738 
739     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
740       continue;
741 
742     MCInst *Inst = BB.getFirstNonPseudoInstr();
743     const bool IsTailCall = MIB->isTailCall(*Inst);
744 
745     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
746       continue;
747 
748     // If we operate after SCTC make sure it's not a conditional tail call.
749     if (IsTailCall && MIB->isConditionalBranch(*Inst))
750       continue;
751 
752     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
753     BinaryBasicBlock *Succ = BB.getSuccessor();
754 
755     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
756       continue;
757 
758     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
759 
760     for (BinaryBasicBlock *Pred : Preds) {
761       if (Pred->isLandingPad())
762         continue;
763 
764       if (Pred->getSuccessor() == &BB ||
765           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
766           Pred->getConditionalSuccessor(false) == &BB)
767         if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
768             MarkInvalid)
769           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
770                        BB.isEntryPoint());
771     }
772   }
773 
774   return NumDoubleJumps;
775 }
776 
777 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
778     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
779     const BinaryBasicBlock *BB, const bool DirectionFlag) {
780   if (BeenOptimized.count(PredBB))
781     return false;
782 
783   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
784 
785   if (IsForward)
786     ++NumOrigForwardBranches;
787   else
788     ++NumOrigBackwardBranches;
789 
790   if (opts::SctcMode == opts::SctcAlways)
791     return true;
792 
793   if (opts::SctcMode == opts::SctcPreserveDirection)
794     return IsForward == DirectionFlag;
795 
796   const ErrorOr<std::pair<double, double>> Frequency =
797       PredBB->getBranchStats(BB);
798 
799   // It's ok to rewrite the conditional branch if the new target will be
800   // a backward branch.
801 
802   // If no data available for these branches, then it should be ok to
803   // do the optimization since it will reduce code size.
804   if (Frequency.getError())
805     return true;
806 
807   // TODO: should this use misprediction frequency instead?
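  // Heuristic: a forward branch taken at least half the time, or a backward
  // branch taken at most half the time, is treated as pointing in its
  // "natural" direction; the rewrite is allowed only when this matches
  // DirectionFlag, the direction the conditional tail call would take.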
808   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
809                       (!IsForward && Frequency.get().first <= 0.5);
810 
811   return Result == DirectionFlag;
812 }
813 
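// Convert conditional branches that target a block containing only a direct
// tail call into conditional tail calls: the conditional branch is retargeted
// at the callee (reversing its condition when the tail call lies on the
// fallthrough path), the taken count is recorded in a "CTCTakenCount"
// annotation, the offset used by BAT is preserved, and the now-unused CFG
// edge is removed so empty blocks can be cleaned up afterwards.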
814 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
815   // Need updated layout indices to correctly detect a branch's direction.
816   BF.getLayout().updateLayoutIndices();
817   BF.markUnreachableBlocks();
818 
819   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
820   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
821   uint64_t NumLocalCTCCandidates = 0;
822   uint64_t NumLocalCTCs = 0;
823   uint64_t LocalCTCTakenCount = 0;
824   uint64_t LocalCTCExecCount = 0;
825   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
826       NeedsUncondBranch;
827 
828   // Will the block be deleted by UCE?
829   auto isValid = [](const BinaryBasicBlock *BB) {
830     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
831   };
832 
833   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
834     // Locate BB with a single direct tail-call instruction.
835     if (BB->getNumNonPseudos() != 1)
836       continue;
837 
838     MCInst *Instr = BB->getFirstNonPseudoInstr();
839     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
840       continue;
841 
842     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
843     if (!CalleeSymbol)
844       continue;
845 
846     // Detect direction of the possible conditional tail call.
847     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
848 
849     // Iterate through all predecessors.
850     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
851       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
852       if (!CondSucc)
853         continue;
854 
855       ++NumLocalCTCCandidates;
856 
857       const MCSymbol *TBB = nullptr;
858       const MCSymbol *FBB = nullptr;
859       MCInst *CondBranch = nullptr;
860       MCInst *UncondBranch = nullptr;
861       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
862 
863       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
864       if (!Result) {
865         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
866                    PredBB->dump());
867         continue;
868       }
869 
870       assert(Result && "internal error analyzing conditional branch");
871       assert(CondBranch && "conditional branch expected");
872 
873       // Skip dynamic branches for now.
874       if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
875         continue;
876 
877       // It's possible that PredBB is also a successor to BB that may have
878       // been processed by a previous iteration of the SCTC loop, in which
879       // case it may have been marked invalid.  We should skip rewriting in
880       // this case.
881       if (!PredBB->isValid()) {
882         assert(PredBB->isSuccessor(BB) &&
883                "PredBB should be valid if it is not a successor to BB");
884         continue;
885       }
886 
887       // We don't want to reverse the direction of the branch in the new
888       // order without further profile analysis.
889       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
890       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
891         continue;
892 
893       // Record this block so that we don't try to optimize it twice.
894       BeenOptimized.insert(PredBB);
895 
896       uint64_t Count = 0;
897       if (CondSucc != BB) {
898         // Patch the new target address into the conditional branch.
899         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
900         // Since we reversed the condition on the branch we need to change
901         // the target for the unconditional branch or add an unconditional
902         // branch to the old target.  This has to be done manually since
903         // fixupBranches is not called after SCTC.
904         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
905         Count = PredBB->getFallthroughBranchInfo().Count;
906       } else {
907         // Change destination of the conditional branch.
908         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
909         Count = PredBB->getTakenBranchInfo().Count;
910       }
911       const uint64_t CTCTakenFreq =
912           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
913 
914       // Annotate it, so "isCall" returns true for this jcc
915       MIB->setConditionalTailCall(*CondBranch);
916       // Add info about the conditional tail call frequency, otherwise this
917       // info will be lost when we delete the associated BranchInfo entry
918       auto &CTCAnnotation =
919           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
920       CTCAnnotation = CTCTakenFreq;
921       // Preserve Offset annotation, used in BAT.
922       // Instr is a direct tail call instruction that was created when CTCs are
923       // first expanded, and has the original CTC offset set.
924       if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
925         MIB->setOffset(*CondBranch, *Offset);
926 
927       // Remove the unused successor which may be eliminated later
928       // if there are no other users.
929       PredBB->removeSuccessor(BB);
930       // Update BB execution count
931       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
932         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
933       else if (CTCTakenFreq > BB->getKnownExecutionCount())
934         BB->setExecutionCount(0);
935 
936       ++NumLocalCTCs;
937       LocalCTCTakenCount += CTCTakenFreq;
938       LocalCTCExecCount += PredBB->getKnownExecutionCount();
939     }
940 
941     // Remove the block from CFG if all predecessors were removed.
942     BB->markValid(isValid(BB));
943   }
944 
945   // Add unconditional branches at the end of BBs to new successors
946   // as long as the successor is not a fallthrough.
947   for (auto &Entry : NeedsUncondBranch) {
948     BinaryBasicBlock *PredBB = Entry.first;
949     const BinaryBasicBlock *CondSucc = Entry.second;
950 
951     const MCSymbol *TBB = nullptr;
952     const MCSymbol *FBB = nullptr;
953     MCInst *CondBranch = nullptr;
954     MCInst *UncondBranch = nullptr;
955     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
956 
957     // Find the next valid block.  Invalid blocks will be deleted
958     // so they shouldn't be considered fallthrough targets.
959     const BinaryBasicBlock *NextBlock =
960         BF.getLayout().getBasicBlockAfter(PredBB, false);
961     while (NextBlock && !isValid(NextBlock))
962       NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
963 
964     // Get the unconditional successor to this block.
965     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
966     assert(PredSucc && "The other branch should be a tail call");
967 
968     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
969 
970     if (UncondBranch) {
971       if (HasFallthrough)
972         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
973       else
974         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
975     } else if (!HasFallthrough) {
976       MCInst Branch;
977       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
978       PredBB->addInstruction(Branch);
979     }
980   }
981 
982   if (NumLocalCTCs > 0) {
983     NumDoubleJumps += fixDoubleJumps(BF, true);
984     // Clean-up unreachable tail-call blocks.
985     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
986     DeletedBlocks += Stats.first;
987     DeletedBytes += Stats.second;
988 
989     assert(BF.validateCFG());
990   }
991 
992   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
993                     << " conditional tail calls from a total of "
994                     << NumLocalCTCCandidates << " candidates in function " << BF
995                     << ". CTCs execution count for this function is "
996                     << LocalCTCExecCount << " and CTC taken count is "
997                     << LocalCTCTakenCount << "\n";);
998 
999   NumTailCallsPatched += NumLocalCTCs;
1000   NumCandidateTailCalls += NumLocalCTCCandidates;
1001   CTCExecCount += LocalCTCExecCount;
1002   CTCTakenCount += LocalCTCTakenCount;
1003 
1004   return NumLocalCTCs > 0;
1005 }
1006 
1007 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
1008   if (!BC.isX86())
1009     return Error::success();
1010 
1011   for (auto &It : BC.getBinaryFunctions()) {
1012     BinaryFunction &Function = It.second;
1013 
1014     if (!shouldOptimize(Function))
1015       continue;
1016 
1017     if (fixTailCalls(Function)) {
1018       Modified.insert(&Function);
1019       Function.setHasCanonicalCFG(false);
1020     }
1021   }
1022 
1023   if (NumTailCallsPatched)
1024     BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1025               << " tail calls (" << NumOrigForwardBranches << " forward, "
1026               << NumOrigBackwardBranches << " backward)"
1027               << " from a total of " << NumCandidateTailCalls
1028               << " while removing " << NumDoubleJumps << " double jumps"
1029               << " and removing " << DeletedBlocks << " basic blocks"
1030               << " totalling " << DeletedBytes
1031               << " bytes of code. CTCs total execution count is "
1032               << CTCExecCount << " and the number of times CTCs are taken is "
1033               << CTCTakenCount << "\n";
1034   return Error::success();
1035 }
1036 
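// Rewrite instructions into shorter equivalent encodings where the target's
// MCPlusBuilder knows one (x86 only; see runOnFunctions below). Instructions
// that carry an explicit Size annotation are skipped so their original
// encoded length is preserved.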
1037 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1038   uint64_t Count = 0;
1039   const BinaryContext &BC = Function.getBinaryContext();
1040   for (BinaryBasicBlock &BB : Function) {
1041     for (MCInst &Inst : BB) {
1042       // Skip shortening instructions with Size annotation.
1043       if (BC.MIB->getSize(Inst))
1044         continue;
1045 
1046       MCInst OriginalInst;
1047       if (opts::Verbosity > 2)
1048         OriginalInst = Inst;
1049 
1050       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1051         continue;
1052 
1053       if (opts::Verbosity > 2) {
1054         auto Lock = BC.scopeLock(); // Keep the lock alive while printing.
1055         BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1056         BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
1057         BC.outs() << "BOLT-INFO: to:";
1058         BC.printInstruction(BC.outs(), Inst, 0, &Function);
1059       }
1060 
1061       ++Count;
1062     }
1063   }
1064 
1065   return Count;
1066 }
1067 
1068 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1069   std::atomic<uint64_t> NumShortened{0};
1070   if (!BC.isX86())
1071     return Error::success();
1072 
1073   ParallelUtilities::runOnEachFunction(
1074       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1075       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1076       nullptr, "ShortenInstructions");
1077 
1078   if (NumShortened)
1079     BC.outs() << "BOLT-INFO: " << NumShortened
1080               << " instructions were shortened\n";
1081   return Error::success();
1082 }
1083 
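// Append a trap instruction after every indirect tail call, presumably so
// that control can never silently slide past the indirect jump into whatever
// code follows. Enabled with '-peepholes=tailcall-traps'.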
1084 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1085   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1086   for (BinaryBasicBlock &BB : Function) {
1087     MCInst *Inst = BB.getLastNonPseudoInstr();
1088     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1089       MCInst Trap;
1090       MIB->createTrap(Trap);
1091       BB.addInstruction(Trap);
1092       ++TailCallTraps;
1093     }
1094   }
1095 }
1096 
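// Remove conditional branches whose two successors are the same block, i.e.
// branches whose condition cannot affect where control goes; the duplicate
// edge is merged away by removeDuplicateConditionalSuccessor().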
1097 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1098   for (BinaryBasicBlock &BB : Function) {
1099     if (BB.succ_size() != 2)
1100       continue;
1101 
1102     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1103     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1104     if (CondBB != UncondBB)
1105       continue;
1106 
1107     const MCSymbol *TBB = nullptr;
1108     const MCSymbol *FBB = nullptr;
1109     MCInst *CondBranch = nullptr;
1110     MCInst *UncondBranch = nullptr;
1111     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1112 
1113     // analyzeBranch() can fail due to unusual branch instructions,
1114     // e.g. jrcxz, or jump tables (indirect jump).
1115     if (!Result || !CondBranch)
1116       continue;
1117 
1118     BB.removeDuplicateConditionalSuccessor(CondBranch);
1119     ++NumUselessCondBranches;
1120   }
1121 }
1122 
1123 Error Peepholes::runOnFunctions(BinaryContext &BC) {
1124   const char Opts =
1125       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1126                       [](const char A, const PeepholeOpts B) { return A | B; });
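  // For example, '-peepholes=double-jumps,useless-branches' ORs the two
  // corresponding PeepholeOpts bits into Opts, enabling exactly those
  // sub-passes in the loop below.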
1127   if (Opts == PEEP_NONE)
1128     return Error::success();
1129 
1130   for (auto &It : BC.getBinaryFunctions()) {
1131     BinaryFunction &Function = It.second;
1132     if (shouldOptimize(Function)) {
1133       if (Opts & PEEP_DOUBLE_JUMPS)
1134         NumDoubleJumps += fixDoubleJumps(Function, false);
1135       if (Opts & PEEP_TAILCALL_TRAPS)
1136         addTailcallTraps(Function);
1137       if (Opts & PEEP_USELESS_BRANCHES)
1138         removeUselessCondBranches(Function);
1139       assert(Function.validateCFG());
1140     }
1141   }
1142   BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1143             << " double jumps patched.\n"
1144             << "BOLT-INFO: Peephole: " << TailCallTraps
1145             << " tail call traps inserted.\n"
1146             << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1147             << " useless conditional branches removed.\n";
1148   return Error::success();
1149 }
1150 
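// Replace loads whose target address can be computed statically and falls in
// a read-only, relocation-free section with an immediate operand holding the
// loaded constant. Illustrative (assumed x86-style) shape: a PC-relative
// 'mov reg, [rip + symbol]' reading from .rodata could become 'mov reg, imm'.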
1151 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1152   BinaryContext &BC = BF.getBinaryContext();
1153   MCPlusBuilder *MIB = BC.MIB.get();
1154 
1155   uint64_t NumLocalLoadsSimplified = 0;
1156   uint64_t NumDynamicLocalLoadsSimplified = 0;
1157   uint64_t NumLocalLoadsFound = 0;
1158   uint64_t NumDynamicLocalLoadsFound = 0;
1159 
1160   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
1161     for (MCInst &Inst : *BB) {
1162       unsigned Opcode = Inst.getOpcode();
1163       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1164 
1165       // Skip instructions that do not load from memory.
1166       if (!Desc.mayLoad())
1167         continue;
1168 
1169       // Try to statically evaluate the target memory address.
1170       uint64_t TargetAddress;
1171 
1172       if (MIB->hasPCRelOperand(Inst)) {
1173         // Try to find the symbol that corresponds to the PC-relative operand.
1174         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1175         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1176         assert(DispOpI->isExpr() &&
1177                "found PC-relative with non-symbolic displacement");
1178 
1179         // Get displacement symbol.
1180         const MCSymbol *DisplSymbol;
1181         uint64_t DisplOffset;
1182 
1183         std::tie(DisplSymbol, DisplOffset) =
1184             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1185 
1186         if (!DisplSymbol)
1187           continue;
1188 
1189         // Look up the symbol address in the global symbols map of the binary
1190         // context object.
1191         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1192         if (!BD)
1193           continue;
1194         TargetAddress = BD->getAddress() + DisplOffset;
1195       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1196         continue;
1197       }
1198 
1199       // Get the contents of the section containing the target address of the
1200       // memory operand. We are only interested in read-only sections.
1201       ErrorOr<BinarySection &> DataSection =
1202           BC.getSectionForAddress(TargetAddress);
1203       if (!DataSection || DataSection->isWritable())
1204         continue;
1205 
1206       if (BC.getRelocationAt(TargetAddress) ||
1207           BC.getDynamicRelocationAt(TargetAddress))
1208         continue;
1209 
1210       uint32_t Offset = TargetAddress - DataSection->getAddress();
1211       StringRef ConstantData = DataSection->getContents();
1212 
1213       ++NumLocalLoadsFound;
1214       if (BB->hasProfile())
1215         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1216 
1217       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1218         ++NumLocalLoadsSimplified;
1219         if (BB->hasProfile())
1220           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1221       }
1222     }
1223   }
1224 
1225   NumLoadsFound += NumLocalLoadsFound;
1226   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1227   NumLoadsSimplified += NumLocalLoadsSimplified;
1228   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1229 
1230   return NumLocalLoadsSimplified > 0;
1231 }
1232 
1233 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1234   for (auto &It : BC.getBinaryFunctions()) {
1235     BinaryFunction &Function = It.second;
1236     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1237       Modified.insert(&Function);
1238   }
1239 
1240   BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1241             << NumLoadsFound << " loads from a statically computed address.\n"
1242             << "BOLT-INFO: dynamic loads simplified: "
1243             << NumDynamicLoadsSimplified << "\n"
1244             << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1245             << "\n";
1246   return Error::success();
1247 }
1248 
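// Assign output code section names. Injected functions always go to the
// injected-code sections. In relocation mode, hot-text-mover functions are
// pinned to the mover section, functions without a valid layout index may be
// placed in the cold code section (when a profile or a user-supplied function
// order is present), and split functions additionally get a cold section name
// for their cold fragments.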
1249 Error AssignSections::runOnFunctions(BinaryContext &BC) {
1250   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1251     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1252     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1253   }
1254 
1255   // In non-relocation mode functions have pre-assigned section names.
1256   if (!BC.HasRelocations)
1257     return Error::success();
1258 
1259   const bool UseColdSection =
1260       BC.NumProfiledFuncs > 0 ||
1261       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1262   for (auto &BFI : BC.getBinaryFunctions()) {
1263     BinaryFunction &Function = BFI.second;
1264     if (opts::isHotTextMover(Function)) {
1265       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1266       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1267       continue;
1268     }
1269 
1270     if (!UseColdSection || Function.hasValidIndex())
1271       Function.setCodeSectionName(BC.getMainCodeSectionName());
1272     else
1273       Function.setCodeSectionName(BC.getColdCodeSectionName());
1274 
1275     if (Function.isSplit())
1276       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1277   }
1278   return Error::success();
1279 }
1280 
1281 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1282   double FlowImbalanceMean = 0.0;
1283   size_t NumBlocksConsidered = 0;
1284   double WorstBias = 0.0;
1285   const BinaryFunction *WorstBiasFunc = nullptr;
1286 
1287   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1288   // of incoming edges for each BB. Likewise, we fill an OutgoingMap with the
1289   // sum of the frequencies of outgoing edges.
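  // For each block considered, the imbalance is |outgoing - incoming| /
  // incoming; the pass reports the mean of this ratio over all considered
  // blocks and, from a second walk over the CFGs, its standard deviation,
  // both printed as percentages.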
1290   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1291   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1292   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1293 
1294   // Compute mean
1295   for (const auto &BFI : BC.getBinaryFunctions()) {
1296     const BinaryFunction &Function = BFI.second;
1297     if (Function.empty() || !Function.isSimple())
1298       continue;
1299     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1300     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1301     for (const BinaryBasicBlock &BB : Function) {
1302       uint64_t TotalOutgoing = 0ULL;
1303       auto SuccBIIter = BB.branch_info_begin();
1304       for (BinaryBasicBlock *Succ : BB.successors()) {
1305         uint64_t Count = SuccBIIter->Count;
1306         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1307           ++SuccBIIter;
1308           continue;
1309         }
1310         TotalOutgoing += Count;
1311         IncomingMap[Succ] += Count;
1312         ++SuccBIIter;
1313       }
1314       OutgoingMap[&BB] = TotalOutgoing;
1315     }
1316 
1317     size_t NumBlocks = 0;
1318     double Mean = 0.0;
1319     for (const BinaryBasicBlock &BB : Function) {
1320       // Do not compute score for low frequency blocks, entry or exit blocks
1321       // Do not compute the score for low-frequency, entry, or exit blocks.
1322         continue;
1323       ++NumBlocks;
1324       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1325       Mean += fabs(Difference / IncomingMap[&BB]);
1326     }
1327 
1328     FlowImbalanceMean += Mean;
1329     NumBlocksConsidered += NumBlocks;
1330     if (!NumBlocks)
1331       continue;
1332     double FuncMean = Mean / NumBlocks;
1333     if (FuncMean > WorstBias) {
1334       WorstBias = FuncMean;
1335       WorstBiasFunc = &Function;
1336     }
1337   }
1338   if (NumBlocksConsidered > 0)
1339     FlowImbalanceMean /= NumBlocksConsidered;
1340 
1341   // Compute standard deviation
1342   NumBlocksConsidered = 0;
1343   double FlowImbalanceVar = 0.0;
1344   for (const auto &BFI : BC.getBinaryFunctions()) {
1345     const BinaryFunction &Function = BFI.second;
1346     if (Function.empty() || !Function.isSimple())
1347       continue;
1348     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1349     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1350     for (const BinaryBasicBlock &BB : Function) {
1351       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1352         continue;
1353       ++NumBlocksConsidered;
1354       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1355       FlowImbalanceVar +=
1356           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1357     }
1358   }
1359   if (NumBlocksConsidered) {
1360     FlowImbalanceVar /= NumBlocksConsidered;
1361     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1362   }
1363 
1364   // Report to user
1365   BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1366                       (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1367   if (WorstBiasFunc && opts::Verbosity >= 1) {
1368     BC.outs() << "Worst average bias observed in "
1369               << WorstBiasFunc->getPrintName() << "\n";
1370     LLVM_DEBUG(WorstBiasFunc->dump());
1371   }
1372   return Error::success();
1373 }
1374 
1375 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1376   uint64_t NumRegularFunctions = 0;
1377   uint64_t NumStaleProfileFunctions = 0;
1378   uint64_t NumAllStaleFunctions = 0;
1379   uint64_t NumInferredFunctions = 0;
1380   uint64_t NumNonSimpleProfiledFunctions = 0;
1381   uint64_t NumUnknownControlFlowFunctions = 0;
1382   uint64_t TotalSampleCount = 0;
1383   uint64_t StaleSampleCount = 0;
1384   uint64_t InferredSampleCount = 0;
1385   std::vector<const BinaryFunction *> ProfiledFunctions;
1386   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1387   for (auto &BFI : BC.getBinaryFunctions()) {
1388     const BinaryFunction &Function = BFI.second;
1389 
1390     // Ignore PLT functions for stats.
1391     if (Function.isPLTFunction())
1392       continue;
1393 
1394     // Adjustment for BAT mode: the profile for BOLT split fragments is combined
1395     // so only count the hot fragment.
1396     const uint64_t Address = Function.getAddress();
1397     bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
1398                                           BAT && BAT->isBATFunction(Address) &&
1399                                           !BAT->fetchParentAddress(Address);
1400 
1401     ++NumRegularFunctions;
1402 
1403     // In BOLTed binaries, split functions are non-simple (due to non-relocation
1404     // mode), but the original function is known to be simple, and we have a
1405     // valid profile for it.
1406     if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
1407       if (Function.hasProfile())
1408         ++NumNonSimpleProfiledFunctions;
1409       continue;
1410     }
1411 
1412     if (Function.hasUnknownControlFlow()) {
1413       if (opts::PrintUnknownCFG)
1414         Function.dump();
1415       else if (opts::PrintUnknown)
1416         BC.errs() << "function with unknown control flow: " << Function << '\n';
1417 
1418       ++NumUnknownControlFlowFunctions;
1419     }
1420 
1421     if (!Function.hasProfile())
1422       continue;
1423 
1424     uint64_t SampleCount = Function.getRawBranchCount();
1425     TotalSampleCount += SampleCount;
1426 
1427     if (Function.hasValidProfile()) {
1428       ProfiledFunctions.push_back(&Function);
1429       if (Function.hasInferredProfile()) {
1430         ++NumInferredFunctions;
1431         InferredSampleCount += SampleCount;
1432         ++NumAllStaleFunctions;
1433       }
1434     } else {
1435       if (opts::ReportStaleFuncs) {
1436         BC.outs() << StaleFuncsHeader;
1437         StaleFuncsHeader = "";
1438         BC.outs() << "  " << Function << '\n';
1439       }
1440       ++NumStaleProfileFunctions;
1441       StaleSampleCount += SampleCount;
1442       ++NumAllStaleFunctions;
1443     }
1444   }
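       // The loop above bucketed profiled functions into valid, inferred, and
       // stale sets; the remainder of the pass only aggregates and prints the
       // resulting statistics.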
1445   BC.NumProfiledFuncs = ProfiledFunctions.size();
1446   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1447 
1448   const size_t NumAllProfiledFunctions =
1449       ProfiledFunctions.size() + NumStaleProfileFunctions;
1450   BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1451             << NumRegularFunctions << " functions in the binary ("
1452             << format("%.1f", NumAllProfiledFunctions /
1453                                   (float)NumRegularFunctions * 100.0f)
1454             << "%) have non-empty execution profile\n";
1455   if (NumNonSimpleProfiledFunctions) {
1456     BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1457               << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1458               << " with profile could not be optimized\n";
1459   }
1460   if (NumAllStaleFunctions) {
1461     const float PctStale =
1462         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
1463     const float PctStaleFuncsWithEqualBlockCount =
1464         (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
1465         NumAllStaleFunctions * 100.0f;
1466     const float PctStaleBlocksWithEqualIcount =
1467         (float)BC.Stats.NumStaleBlocksWithEqualIcount /
1468         BC.Stats.NumStaleBlocks * 100.0f;
1469     auto printErrorOrWarning = [&]() {
1470       if (PctStale > opts::StaleThreshold)
1471         BC.errs() << "BOLT-ERROR: ";
1472       else
1473         BC.errs() << "BOLT-WARNING: ";
1474     };
1475     printErrorOrWarning();
1476     BC.errs() << NumAllStaleFunctions
1477               << format(" (%.1f%% of all profiled)", PctStale) << " function"
1478               << (NumAllStaleFunctions == 1 ? "" : "s")
1479               << " have invalid (possibly stale) profile."
1480                  " Use -report-stale to see the list.\n";
1481     if (TotalSampleCount > 0) {
1482       printErrorOrWarning();
1483       BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
1484                 << TotalSampleCount << " samples in the binary ("
1485                 << format("%.1f",
1486                           ((100.0f * (StaleSampleCount + InferredSampleCount)) /
1487                            TotalSampleCount))
1488                 << "%) belong to functions with invalid"
1489                    " (possibly stale) profile.\n";
1490     }
1491     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
1492               << " stale function"
1493               << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
1494               << format(" (%.1f%% of all stale)",
1495                         PctStaleFuncsWithEqualBlockCount)
1496               << " have matching block count.\n";
1497     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
1498               << " stale block"
1499               << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
1500               << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
1501               << " have matching icount.\n";
1502     if (PctStale > opts::StaleThreshold) {
1503       return createFatalBOLTError(
1504           Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
1505           Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
1506     }
1507   }
1508   if (NumInferredFunctions) {
1509     BC.outs() << format(
1510         "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
1511         "%.2f%% of stale) functions responsible for %.2f%% samples"
1512         " (%zu out of %zu)\n",
1513         NumInferredFunctions,
1514         100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1515         100.0 * NumInferredFunctions / NumAllStaleFunctions,
1516         100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
1517         TotalSampleCount);
1518     BC.outs() << format(
1519         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1520         " (%zu out of %zu stale) responsible for %.2f%% samples"
1521         " (%zu out of %zu stale)\n",
1522         100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1523         BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1524         100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1525         BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1526   }
1527 
1528   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1529     BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1530               << " objects was ignored\n";
1531   }
1532 
1533   if (ProfiledFunctions.size() > 10) {
1534     if (opts::Verbosity >= 1) {
1535       BC.outs() << "BOLT-INFO: top called functions are:\n";
1536       llvm::sort(ProfiledFunctions,
1537                  [](const BinaryFunction *A, const BinaryFunction *B) {
1538                    return B->getExecutionCount() < A->getExecutionCount();
1539                  });
1540       auto SFI = ProfiledFunctions.begin();
1541       auto SFIend = ProfiledFunctions.end();
1542       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1543            ++SFI, ++I)
1544         BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
1545                   << '\n';
1546     }
1547   }
1548 
1549   if (!opts::PrintSortedBy.empty()) {
1550     std::vector<BinaryFunction *> Functions;
1551     std::map<const BinaryFunction *, DynoStats> Stats;
1552 
1553     for (auto &BFI : BC.getBinaryFunctions()) {
1554       BinaryFunction &BF = BFI.second;
1555       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1556         Functions.push_back(&BF);
1557         Stats.emplace(&BF, getDynoStats(BF));
1558       }
1559     }
1560 
1561     const bool SortAll =
1562         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1563 
1564     const bool Ascending =
1565         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1566 
1567     std::function<bool(const DynoStats &, const DynoStats &)>
1568         DynoStatsComparator =
1569             SortAll ? [](const DynoStats &StatsA,
1570                          const DynoStats &StatsB) { return StatsA < StatsB; }
1571                     : [](const DynoStats &StatsA, const DynoStats &StatsB) {
1572                         return StatsA.lessThan(StatsB, opts::PrintSortedBy);
1573                       };
1574 
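         // A descending sort is obtained below by swapping the comparator's
         // arguments rather than by negating the comparison.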
1575     llvm::stable_sort(Functions,
1576                       [Ascending, &Stats, DynoStatsComparator](
1577                           const BinaryFunction *A, const BinaryFunction *B) {
1578                         auto StatsItr = Stats.find(A);
1579                         assert(StatsItr != Stats.end());
1580                         const DynoStats &StatsA = StatsItr->second;
1581 
1582                         StatsItr = Stats.find(B);
1583                         assert(StatsItr != Stats.end());
1584                         const DynoStats &StatsB = StatsItr->second;
1585 
1586                         return Ascending ? DynoStatsComparator(StatsA, StatsB)
1587                                          : DynoStatsComparator(StatsB, StatsA);
1588                       });
1589 
1590     BC.outs() << "BOLT-INFO: top functions sorted by ";
1591     if (SortAll) {
1592       BC.outs() << "dyno stats";
1593     } else {
1594       BC.outs() << "(";
1595       bool PrintComma = false;
1596       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1597         if (PrintComma)
1598           BC.outs() << ", ";
1599         BC.outs() << DynoStats::Description(Category);
1600         PrintComma = true;
1601       }
1602       BC.outs() << ")";
1603     }
1604 
1605     BC.outs() << " are:\n";
1606     auto SFI = Functions.begin();
1607     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1608       const DynoStats Stats = getDynoStats(**SFI);
1609       BC.outs() << "  " << **SFI;
1610       if (!SortAll) {
1611         BC.outs() << " (";
1612         bool PrintComma = false;
1613         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1614           if (PrintComma)
1615             BC.outs() << ", ";
1616           BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1617           PrintComma = true;
1618         }
1619         BC.outs() << ")";
1620       }
1621       BC.outs() << "\n";
1622     }
1623   }
1624 
1625   if (!BC.TrappedFunctions.empty()) {
1626     BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1627               << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1628               << " will trap on entry. Use -trap-avx512=0 to disable"
1629                  " traps.";
1630     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1631       BC.errs() << '\n';
1632       for (const BinaryFunction *Function : BC.TrappedFunctions)
1633         BC.errs() << "  " << *Function << '\n';
1634     } else {
1635       BC.errs() << " Use -v=1 to see the list.\n";
1636     }
1637   }
1638 
1639   // Collect and print information about suboptimal code layout on input.
1640   if (opts::ReportBadLayout) {
1641     std::vector<BinaryFunction *> SuboptimalFuncs;
1642     for (auto &BFI : BC.getBinaryFunctions()) {
1643       BinaryFunction &BF = BFI.second;
1644       if (!BF.hasValidProfile())
1645         continue;
1646 
1647       const uint64_t HotThreshold =
1648           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1649       bool HotSeen = false;
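           // Scanning the layout backwards, HotSeen marks that a block executed
           // more often than the function itself has been found; any zero-count
           // block encountered after that (i.e. earlier in the layout) means
           // cold code sits before hot code. For example, with a function
           // execution count of 1000 and layout counts [1000, 0, 4000, 1000],
           // the reverse scan sees 4000 (> 1000, HotSeen) and then 0, so the
           // function is flagged.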
1650       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1651         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1652           HotSeen = true;
1653           continue;
1654         }
1655         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1656           SuboptimalFuncs.push_back(&BF);
1657           break;
1658         }
1659       }
1660     }
1661 
1662     if (!SuboptimalFuncs.empty()) {
1663       llvm::sort(SuboptimalFuncs,
1664                  [](const BinaryFunction *A, const BinaryFunction *B) {
1665                    return A->getKnownExecutionCount() / A->getSize() >
1666                           B->getKnownExecutionCount() / B->getSize();
1667                  });
1668 
1669       BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1670                 << " functions have "
1671                    "cold code in the middle of hot code. Top functions are:\n";
1672       for (unsigned I = 0;
1673            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1674                         SuboptimalFuncs.size());
1675            ++I)
1676         SuboptimalFuncs[I]->print(BC.outs());
1677     }
1678   }
1679 
1680   if (NumUnknownControlFlowFunctions) {
1681     BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1682               << " functions have instructions with unknown control flow";
1683     if (!opts::PrintUnknown)
1684       BC.outs() << ". Use -print-unknown to see the list.";
1685     BC.outs() << '\n';
1686   }
1687   return Error::success();
1688 }
1689 
1690 Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1691   for (auto &BFI : BC.getBinaryFunctions())
1692     for (BinaryBasicBlock &BB : BFI.second)
1693       for (MCInst &Instruction : BB)
1694         BC.MIB->lowerTailCall(Instruction);
1695   return Error::success();
1696 }
1697 
1698 Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1699   if (!BC.isX86())
1700     return Error::success();
1701 
1702   uint64_t NumPrefixesRemoved = 0;
1703   uint64_t NumBytesSaved = 0;
1704   for (auto &BFI : BC.getBinaryFunctions()) {
1705     for (BinaryBasicBlock &BB : BFI.second) {
1706       auto LastInstRIter = BB.getLastNonPseudo();
1707       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1708           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1709         continue;
1710 
1711       NumPrefixesRemoved += BB.getKnownExecutionCount();
1712       ++NumBytesSaved;
1713     }
1714   }
1715 
1716   if (NumBytesSaved)
1717     BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
1718               << " 'repz' prefixes"
1719                  " with estimated execution count of "
1720               << NumPrefixesRemoved << " times.\n";
1721   return Error::success();
1722 }
1723 
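     // Editor's note: on x86, 'repz ret' is the two-byte encoding F3 C3,
     // historically emitted as a branch-predictor workaround on some AMD
     // cores; deleting the F3 (REP) prefix leaves the one-byte C3 ret, which
     // is where the one-byte-per-site saving reported above comes from.
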
1724 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1725   if (!BC.isX86())
1726     return Error::success();
1727 
1728   uint64_t NumInlined = 0;
1729   uint64_t NumInlinedDyno = 0;
1730   for (auto &BFI : BC.getBinaryFunctions()) {
1731     for (BinaryBasicBlock &BB : BFI.second) {
1732       for (auto II = BB.begin(); II != BB.end(); ++II) {
1733         MCInst &Inst = *II;
1734 
1735         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1736             !Inst.getOperand(0).isExpr())
1737           continue;
1738 
1739         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1740         if (CalleeSymbol->getName() != "memcpy" &&
1741             CalleeSymbol->getName() != "memcpy@PLT" &&
1742             CalleeSymbol->getName() != "_memcpy8")
1743           continue;
1744 
1745         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1746         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1747 
1748         const InstructionListType NewCode =
1749             BC.MIB->createInlineMemcpy(IsMemcpy8);
1750         II = BB.replaceInstruction(II, NewCode);
1751         std::advance(II, NewCode.size() - 1);
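             // If the original call was a tail call, the jump doubled as the
             // function's return; after inlining, an explicit return must be
             // materialized so control still leaves the function here.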
1752         if (IsTailCall) {
1753           MCInst Return;
1754           BC.MIB->createReturn(Return);
1755           II = BB.insertInstruction(std::next(II), std::move(Return));
1756         }
1757 
1758         ++NumInlined;
1759         NumInlinedDyno += BB.getKnownExecutionCount();
1760       }
1761     }
1762   }
1763 
1764   if (NumInlined) {
1765     BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1766     if (NumInlinedDyno)
1767       BC.outs() << ". The calls were executed " << NumInlinedDyno
1768                 << " times based on profile.";
1769     BC.outs() << '\n';
1770   }
1771   return Error::success();
1772 }
1773 
1774 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1775   if (!BinaryFunctionPass::shouldOptimize(Function))
1776     return false;
1777 
1778   for (const std::string &FunctionSpec : Spec) {
1779     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1780     if (Function.hasNameRegex(FunctionName))
1781       return true;
1782   }
1783 
1784   return false;
1785 }
1786 
1787 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1788     const BinaryFunction &Function) const {
1789   StringRef SitesString;
1790   for (const std::string &FunctionSpec : Spec) {
1791     StringRef FunctionName;
1792     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1793     if (Function.hasNameRegex(FunctionName))
1794       break;
1795     SitesString = "";
1796   }
1797 
1798   std::set<size_t> Sites;
1799   SmallVector<StringRef, 4> SitesVec;
1800   SitesString.split(SitesVec, ':');
1801   for (StringRef SiteString : SitesVec) {
1802     if (SiteString.empty())
1803       continue;
1804     size_t Result;
1805     if (!SiteString.getAsInteger(10, Result))
1806       Sites.emplace(Result);
1807   }
1808 
1809   return Sites;
1810 }
1811 
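     // Editor's note on the Spec entries parsed above: each entry has the form
     // "<function-name-regex>[:<site>[:<site>...]]". A hypothetical entry
     // "foo.*:1:3" limits the transformation in functions matching "foo.*" to
     // the first and third qualifying call sites (IDs are 1-based and assigned
     // in the order non-tail calls to memcpy are encountered), while a bare
     // "foo.*" covers every qualifying site in the matching functions.
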
1812 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1813   if (!BC.isX86())
1814     return Error::success();
1815 
1816   uint64_t NumSpecialized = 0;
1817   uint64_t NumSpecializedDyno = 0;
1818   for (auto &BFI : BC.getBinaryFunctions()) {
1819     BinaryFunction &Function = BFI.second;
1820     if (!shouldOptimize(Function))
1821       continue;
1822 
1823     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1824     auto shouldOptimize = [&](size_t N) {
1825       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1826     };
1827 
1828     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1829     size_t CallSiteID = 0;
1830     for (BinaryBasicBlock *CurBB : Blocks) {
1831       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1832         MCInst &Inst = *II;
1833 
1834         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1835             !Inst.getOperand(0).isExpr())
1836           continue;
1837 
1838         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1839         if (CalleeSymbol->getName() != "memcpy" &&
1840             CalleeSymbol->getName() != "memcpy@PLT")
1841           continue;
1842 
1843         if (BC.MIB->isTailCall(Inst))
1844           continue;
1845 
1846         ++CallSiteID;
1847 
1848         if (!shouldOptimize(CallSiteID))
1849           continue;
1850 
1851         // Keep a copy of the memcpy(dest, src, size) call for MemcpyBB below.
1852         MCInst MemcpyInstr = Inst;
1853 
1854         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1855 
1856         BinaryBasicBlock *NextBB = nullptr;
1857         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1858           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1859           NextBB->eraseInstruction(NextBB->begin());
1860         } else {
1861           NextBB = OneByteMemcpyBB->getSuccessor();
1862           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1863           assert(NextBB && "unexpected call to memcpy() with no return");
1864         }
1865 
1866         BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
1867         MemcpyBB->setOffset(CurBB->getInputOffset());
1868         InstructionListType CmpJCC =
1869             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1870                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1871         CurBB->addInstructions(CmpJCC);
1872         CurBB->addSuccessor(MemcpyBB);
1873 
1874         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1875         MemcpyBB->addSuccessor(NextBB);
1876         MemcpyBB->setCFIState(NextBB->getCFIState());
1877         MemcpyBB->setExecutionCount(0);
1878 
1879         // To prevent the actual call from being moved to the cold section, set
1880         // its execution count to 1 whenever the original block is executed.
1881         if (CurBB->getKnownExecutionCount() > 0)
1882           MemcpyBB->setExecutionCount(1);
1883 
1884         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1885         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1886 
1887         ++NumSpecialized;
1888         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1889 
1890         CurBB = NextBB;
1891 
1892         // Note: ++II skips NextBB's first instruction; no memcpy is expected there.
1893         II = CurBB->begin();
1894       }
1895     }
1896   }
1897 
1898   if (NumSpecialized) {
1899     BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
1900               << " memcpy() call sites for size 1";
1901     if (NumSpecializedDyno)
1902       BC.outs() << ". The calls were executed " << NumSpecializedDyno
1903                 << " times based on profile.";
1904     BC.outs() << '\n';
1905   }
1906   return Error::success();
1907 }
1908 
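     // Editor's sketch of the CFG rewrite performed above at each selected
     // call site (names refer to the local variables in the pass):
     //
     //   CurBB:            ...code before the call...,
     //                     compare the size argument (3rd int arg reg) with 1,
     //                     je OneByteMemcpyBB, otherwise go to MemcpyBB
     //   MemcpyBB:         original call to memcpy     -> NextBB
     //   OneByteMemcpyBB:  inlined one-byte copy       -> NextBB
     //   NextBB:           ...code that followed the call...
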
1909 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1910   const BinaryContext &BC = BF.getBinaryContext();
1911   for (BinaryBasicBlock &BB : BF) {
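         // Iterate backwards so erasing an instruction does not shift the
         // indices of entries that have not been visited yet.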
1912     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1913       MCInst &Inst = BB.getInstructionAtIndex(I);
1914       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1915         BB.eraseInstructionAtIndex(I);
1916     }
1917   }
1918 }
1919 
1920 Error RemoveNops::runOnFunctions(BinaryContext &BC) {
1921   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1922     runOnFunction(BF);
1923   };
1924 
1925   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1926     return BF.shouldPreserveNops();
1927   };
1928 
1929   ParallelUtilities::runOnEachFunction(
1930       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1931       SkipFunc, "RemoveNops");
1932   return Error::success();
1933 }
1934 
1935 } // namespace bolt
1936 } // namespace llvm
1937