xref: /llvm-project/bolt/lib/Passes/BinaryPasses.cpp (revision c1a7c5ac73af92316426deed5f8f10f33f729ad2)
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "llvm/Support/CommandLine.h"
19 #include <atomic>
20 #include <mutex>
21 #include <numeric>
22 #include <vector>
23 
24 #define DEBUG_TYPE "bolt-opts"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
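// Return the command-line option name for a dyno stats category: the category
// description with spaces replaced by dashes. The name is stored in a static
// table so the returned pointer remains valid after the call.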
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32 
33   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34 
35   OptNames[C] = bolt::DynoStats::Description(C);
36 
37   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
38 
39   return OptNames[C].c_str();
40 }
41 
42 namespace opts {
43 
44 extern cl::OptionCategory BoltCategory;
45 extern cl::OptionCategory BoltOptCategory;
46 
47 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
48 extern cl::opt<unsigned> Verbosity;
49 extern cl::opt<bool> EnableBAT;
50 extern cl::opt<unsigned> ExecutionCountThreshold;
51 extern cl::opt<bool> UpdateDebugSections;
52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
53 
54 enum DynoStatsSortOrder : char {
55   Ascending,
56   Descending
57 };
58 
59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
60     "print-sorted-by-order",
61     cl::desc("use ascending or descending order when printing functions "
62              "ordered by dyno stats"),
63     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
64 
65 cl::list<std::string>
66 HotTextMoveSections("hot-text-move-sections",
67   cl::desc("list of sections containing functions used for hugifying hot text. "
68            "BOLT makes sure these functions are not placed on the same page as "
69            "the hot text. (default=\'.stub,.mover\')."),
70   cl::value_desc("sec1,sec2,sec3,..."),
71   cl::CommaSeparated,
72   cl::ZeroOrMore,
73   cl::cat(BoltCategory));
74 
75 bool isHotTextMover(const BinaryFunction &Function) {
76   for (std::string &SectionName : opts::HotTextMoveSections) {
77     if (Function.getOriginSectionName() &&
78         *Function.getOriginSectionName() == SectionName)
79       return true;
80   }
81 
82   return false;
83 }
84 
85 static cl::opt<bool> MinBranchClusters(
86     "min-branch-clusters",
87     cl::desc("use a modified clustering algorithm geared towards minimizing "
88              "branches"),
89     cl::Hidden, cl::cat(BoltOptCategory));
90 
91 static cl::list<Peepholes::PeepholeOpts> Peepholes(
92     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
93     cl::value_desc("opt1,opt2,opt3,..."),
94     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
95                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
96                           "remove double jumps when able"),
97                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
98                           "insert tail call traps"),
99                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
100                           "remove useless conditional branches"),
101                clEnumValN(Peepholes::PEEP_ALL, "all",
102                           "enable all peephole optimizations")),
103     cl::ZeroOrMore, cl::cat(BoltOptCategory));
104 
105 static cl::opt<unsigned>
106     PrintFuncStat("print-function-statistics",
107                   cl::desc("print statistics about basic block ordering"),
108                   cl::init(0), cl::cat(BoltOptCategory));
109 
110 static cl::opt<bool> PrintLargeFunctions(
111     "print-large-functions",
112     cl::desc("print functions that could not be overwritten due to excessive "
113              "size"),
114     cl::init(false), cl::cat(BoltOptCategory));
115 
116 static cl::list<bolt::DynoStats::Category>
117     PrintSortedBy("print-sorted-by", cl::CommaSeparated,
118                   cl::desc("print functions sorted by order of dyno stats"),
119                   cl::value_desc("key1,key2,key3,..."),
120                   cl::values(
121 #define D(name, description, ...)                                              \
122   clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
123              description),
124                       REAL_DYNO_STATS
125 #undef D
126                           clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
127                                      "sorted by all stats")),
128                   cl::ZeroOrMore, cl::cat(BoltOptCategory));
129 
130 static cl::opt<bool>
131     PrintUnknown("print-unknown",
132                  cl::desc("print names of functions with unknown control flow"),
133                  cl::cat(BoltCategory), cl::Hidden);
134 
135 static cl::opt<bool>
136     PrintUnknownCFG("print-unknown-cfg",
137                     cl::desc("dump CFG of functions with unknown control flow"),
138                     cl::cat(BoltCategory), cl::ReallyHidden);
139 
140 // Appease MSVC19 with a forward declaration: otherwise it reports an error about
141 // an undeclared variable inside a callback.
142 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
143 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
144     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
145     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
146     cl::values(
147         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
148                    "do not reorder basic blocks"),
149         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
150                    "layout blocks in reverse order"),
151         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
152                    "perform optimal layout based on profile"),
153         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
154                    "branch-predictor",
155                    "perform optimal layout prioritizing branch "
156                    "predictions"),
157         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
158                    "perform optimal layout prioritizing I-cache "
159                    "behavior"),
160         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
161                    "deprecated alias for ext-tsp"),
162         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
163                    "perform layout optimizing I-cache behavior"),
164         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
165                    "cluster-shuffle", "perform random layout of clusters")),
166     cl::ZeroOrMore, cl::cat(BoltOptCategory),
167     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
168       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
169         errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
170                << " use '-reorder-blocks=ext-tsp' instead\n";
171         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
172       }
173     }));
174 
175 static cl::opt<unsigned> ReportBadLayout(
176     "report-bad-layout",
177     cl::desc("print top <uint> functions with suboptimal code layout on input"),
178     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
179 
180 static cl::opt<bool>
181     ReportStaleFuncs("report-stale",
182                      cl::desc("print the list of functions with stale profile"),
183                      cl::Hidden, cl::cat(BoltOptCategory));
184 
185 enum SctcModes : char {
186   SctcAlways,
187   SctcPreserveDirection,
188   SctcHeuristic
189 };
190 
191 static cl::opt<SctcModes>
192 SctcMode("sctc-mode",
193   cl::desc("mode for simplify conditional tail calls"),
194   cl::init(SctcAlways),
195   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
196     clEnumValN(SctcPreserveDirection,
197       "preserve",
198       "only perform sctc when branch direction is "
199       "preserved"),
200     clEnumValN(SctcHeuristic,
201       "heuristic",
202       "use branch prediction data to control sctc")),
203   cl::ZeroOrMore,
204   cl::cat(BoltOptCategory));
205 
206 static cl::opt<unsigned>
207 StaleThreshold("stale-threshold",
208     cl::desc(
209       "maximum percentage of stale functions to tolerate (default: 100)"),
210     cl::init(100),
211     cl::Hidden,
212     cl::cat(BoltOptCategory));
213 
214 static cl::opt<unsigned> TSPThreshold(
215     "tsp-threshold",
216     cl::desc(
217         "maximum number of hot basic blocks in a function for which to use "
218         "a precise TSP solution while re-ordering basic blocks"),
219     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
220 
221 static cl::opt<unsigned> TopCalledLimit(
222     "top-called-limit",
223     cl::desc("maximum number of functions to print in top called "
224              "functions section"),
225     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
226 
227 } // namespace opts
228 
229 namespace llvm {
230 namespace bolt {
231 
232 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
233   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
234          !BF.isIgnored();
235 }
236 
237 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
238   return BF.isSimple() && !BF.isIgnored();
239 }
240 
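// Remove empty blocks that are neither entry points nor landing pads by
// redirecting their predecessors to the single successor, and count duplicate
// CFG edges so that fixBranches() can merge them afterwards.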
241 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
242   uint64_t NumRemoved = 0;
243   uint64_t NumDuplicateEdges = 0;
244   uint64_t NeedsFixBranches = 0;
245   for (BinaryBasicBlock &BB : BF) {
246     if (!BB.empty())
247       continue;
248 
249     if (BB.isEntryPoint() || BB.isLandingPad())
250       continue;
251 
252     // Handle a dangling empty block.
253     if (BB.succ_size() == 0) {
254       // If an empty dangling basic block has a predecessor, it could be a
255       // result of codegen for __builtin_unreachable. In such a case, do not
256       // remove the block.
257       if (BB.pred_size() == 0) {
258         BB.markValid(false);
259         ++NumRemoved;
260       }
261       continue;
262     }
263 
264     // The block should have just one successor.
265     BinaryBasicBlock *Successor = BB.getSuccessor();
266     assert(Successor && "invalid CFG encountered");
267 
268     // Redirect all predecessors to the successor block.
269     while (!BB.pred_empty()) {
270       BinaryBasicBlock *Predecessor = *BB.pred_begin();
271       if (Predecessor->hasJumpTable())
272         break;
273 
274       if (Predecessor == Successor)
275         break;
276 
277       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
278       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
279                                     BI.MispredictedCount);
280       // We need to fix branches even if we fail to replace all successors
281       // and cannot remove the block.
282       NeedsFixBranches = true;
283     }
284 
285     if (BB.pred_empty()) {
286       BB.removeAllSuccessors();
287       BB.markValid(false);
288       ++NumRemoved;
289     }
290   }
291 
292   if (NumRemoved)
293     BF.eraseInvalidBBs();
294 
295   // Check for duplicate successors. Do this after empty block elimination,
296   // since removing empty blocks can create more duplicate successors.
297   for (BinaryBasicBlock &BB : BF)
298     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
299         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
300       ++NumDuplicateEdges;
301 
302   // fixBranches() will get rid of duplicate edges and update jump instructions.
303   if (NumDuplicateEdges || NeedsFixBranches)
304     BF.fixBranches();
305 
306   NumDuplicateEdgesMerged += NumDuplicateEdges;
307   NumBlocksRemoved += NumRemoved;
308 }
309 
310 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
311   ParallelUtilities::runOnEachFunction(
312       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
313       [&](BinaryFunction &BF) { runOnFunction(BF); },
314       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
315       "NormalizeCFG");
316   if (NumBlocksRemoved)
317     BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
318               << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
319   if (NumDuplicateEdgesMerged)
320     BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
321               << " duplicate CFG edge"
322               << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
323   return Error::success();
324 }
325 
326 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
327   BinaryContext &BC = Function.getBinaryContext();
328   unsigned Count;
329   uint64_t Bytes;
330   Function.markUnreachableBlocks();
331   LLVM_DEBUG({
332     for (BinaryBasicBlock &BB : Function) {
333       if (!BB.isValid()) {
334         dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
335                << " in function " << Function << "\n";
336         Function.dump();
337       }
338     }
339   });
340   BinaryContext::IndependentCodeEmitter Emitter =
341       BC.createIndependentMCCodeEmitter();
342   std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
343   DeletedBlocks += Count;
344   DeletedBytes += Bytes;
345   if (Count) {
346     auto L = BC.scopeLock();
347     Modified.insert(&Function);
348     if (opts::Verbosity > 0)
349       BC.outs() << "BOLT-INFO: removed " << Count
350                 << " dead basic block(s) accounting for " << Bytes
351                 << " bytes in function " << Function << '\n';
352   }
353 }
354 
355 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
356   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
357     runOnFunction(BF);
358   };
359 
360   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
361     return !shouldOptimize(BF) || BF.getLayout().block_empty();
362   };
363 
364   ParallelUtilities::runOnEachFunction(
365       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
366       SkipPredicate, "eliminate-unreachable");
367 
368   if (DeletedBlocks)
369     BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
370               << DeletedBytes << " bytes of code\n";
371   return Error::success();
372 }
373 
374 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
375   return (BinaryFunctionPass::shouldPrint(BF) &&
376           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
377 }
378 
379 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
380   // Apply execution count threshold
381   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
382     return false;
383 
384   return BinaryFunctionPass::shouldOptimize(BF);
385 }
386 
387 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
388   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
389     return Error::success();
390 
391   std::atomic_uint64_t ModifiedFuncCount(0);
392   std::mutex FunctionEditDistanceMutex;
393   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
394 
395   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
396     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
397     if (opts::PrintFuncStat > 0)
398       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
399 
400     const bool LayoutChanged =
401         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
402     if (LayoutChanged) {
403       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
404       if (opts::PrintFuncStat > 0) {
405         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
406         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
407         FunctionEditDistance[&BF] = Distance;
408       }
409     }
410   };
411 
412   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
413     return !shouldOptimize(BF);
414   };
415 
416   ParallelUtilities::runOnEachFunction(
417       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
418       "ReorderBasicBlocks");
419   const size_t NumAllProfiledFunctions =
420       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
421 
422   BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
423             << format(
424                    "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
425                    ModifiedFuncCount.load(std::memory_order_relaxed),
426                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
427                        NumAllProfiledFunctions,
428                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
429                        BC.getBinaryFunctions().size());
430 
431   if (opts::PrintFuncStat > 0) {
432     raw_ostream &OS = BC.outs();
433     // Collect functions keyed by score so we can iterate them in sorted order.
434     std::map<uint64_t, BinaryFunction &> ScoreMap;
435     auto &BFs = BC.getBinaryFunctions();
436     for (auto It = BFs.begin(); It != BFs.end(); ++It)
437       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
438           It->second.getFunctionScore(), It->second));
439 
440     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
441     OS << "           There are " << BFs.size() << " functions in total.\n";
442     OS << "           Number of functions being modified: "
443        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
444     OS << "           Printing detailed information for the top "
445        << opts::PrintFuncStat << " functions, ranked by function score."
446        << "\n\n";
447     uint64_t I = 0;
448     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
449              ScoreMap.rbegin();
450          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
451       BinaryFunction &Function = Rit->second;
452 
453       OS << "           Information for top function " << (I + 1) << ":\n";
454       OS << "             Function score: " << Function.getFunctionScore()
455          << "\n";
456       OS << "             Number of basic blocks in this function: "
457          << Function.size() << "\n";
458       OS << "             Number of instructions in this function: "
459          << Function.getInstructionCount() << "\n";
460       OS << "             The edit distance for this function is: "
461          << FunctionEditDistance.lookup(&Function) << "\n\n";
462     }
463   }
464   return Error::success();
465 }
466 
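// Select a reordering algorithm based on the requested layout type (and the
// -min-branch-clusters flag), run it, and return true if the resulting block
// order differs from the current layout.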
467 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
468                                               LayoutType Type,
469                                               bool MinBranchClusters) const {
470   if (BF.size() == 0 || Type == LT_NONE)
471     return false;
472 
473   BinaryFunction::BasicBlockOrderType NewLayout;
474   std::unique_ptr<ReorderAlgorithm> Algo;
475 
476   // Cannot do optimal layout without profile.
477   if (Type != LT_REVERSE && !BF.hasValidProfile())
478     return false;
479 
480   if (Type == LT_REVERSE) {
481     Algo.reset(new ReverseReorderAlgorithm());
482   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
483     // Compute an optimal solution if the problem is small enough.
484     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
485     Algo.reset(new TSPReorderAlgorithm());
486   } else {
487     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
488 
489     std::unique_ptr<ClusterAlgorithm> CAlgo;
490     if (MinBranchClusters)
491       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
492     else
493       CAlgo.reset(new PHGreedyClusterAlgorithm());
494 
495     switch (Type) {
496     case LT_OPTIMIZE:
497       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
498       break;
499 
500     case LT_OPTIMIZE_BRANCH:
501       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
502       break;
503 
504     case LT_OPTIMIZE_CACHE:
505       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
506       break;
507 
508     case LT_OPTIMIZE_EXT_TSP:
509       Algo.reset(new ExtTSPReorderAlgorithm());
510       break;
511 
512     case LT_OPTIMIZE_SHUFFLE:
513       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
514       break;
515 
516     default:
517       llvm_unreachable("unexpected layout type");
518     }
519   }
520 
521   Algo->reorderBasicBlocks(BF, NewLayout);
522 
523   return BF.getLayout().update(NewLayout);
524 }
525 
526 Error FixupBranches::runOnFunctions(BinaryContext &BC) {
527   for (auto &It : BC.getBinaryFunctions()) {
528     BinaryFunction &Function = It.second;
529     if (!BC.shouldEmit(Function) || !Function.isSimple())
530       continue;
531 
532     Function.fixBranches();
533   }
534   return Error::success();
535 }
536 
537 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
538   std::atomic<bool> HasFatal{false};
539   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
540     if (!BF.finalizeCFIState()) {
541       if (BC.HasRelocations) {
542         BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
543                   << ". Exiting.\n";
544         HasFatal = true;
545         return;
546       }
547       BF.setSimple(false);
548       return;
549     }
550 
551     BF.setFinalized();
552 
553     // Update exception handling information.
554     BF.updateEHRanges();
555   };
556 
557   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
558     return !BC.shouldEmit(BF);
559   };
560 
561   ParallelUtilities::runOnEachFunction(
562       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
563       SkipPredicate, "FinalizeFunctions");
564   if (HasFatal)
565     return createFatalBOLTError("finalize CFI state failure");
566   return Error::success();
567 }
568 
569 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
570   if (BC.HasRelocations)
571     return Error::success();
572 
573   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
574   // incorrect metadata.
575   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
576     uint64_t HotSize, ColdSize;
577     std::tie(HotSize, ColdSize) =
578         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
579     if (HotSize > BF.getMaxSize()) {
580       if (opts::PrintLargeFunctions)
581         BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
582                   << (HotSize - BF.getMaxSize()) << " bytes\n";
583       BF.setSimple(false);
584     }
585   };
586 
587   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
588     return !shouldOptimize(BF);
589   };
590 
591   ParallelUtilities::runOnEachFunction(
592       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
593       SkipFunc, "CheckLargeFunctions");
594 
595   return Error::success();
596 }
597 
598 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
599   // Unlike other passes, allow functions in non-CFG state.
600   return BF.isSimple() && !BF.isIgnored();
601 }
602 
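// GnuArgsSize annotations attached to invoke instructions are lowered into
// .cfi_gnu_args_size directives below; a directive is emitted only when the
// value changes, and the running value is reset at the start of each fragment.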
603 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
604   // Convert GnuArgsSize annotations into CFIs.
605   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
606     for (FunctionFragment &FF : BF->getLayout().fragments()) {
607       // Reset at the start of the new fragment.
608       int64_t CurrentGnuArgsSize = 0;
609 
610       for (BinaryBasicBlock *const BB : FF) {
611         for (auto II = BB->begin(); II != BB->end(); ++II) {
612           if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
613             continue;
614 
615           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
616           assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
617           if (NewGnuArgsSize == CurrentGnuArgsSize)
618             continue;
619 
620           auto InsertII = BF->addCFIInstruction(
621               BB, II,
622               MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
623           CurrentGnuArgsSize = NewGnuArgsSize;
624           II = std::next(InsertII);
625         }
626       }
627     }
628   }
629   return Error::success();
630 }
631 
632 // Check for dirty state in MCSymbol objects that might be a consequence
633 // of running calculateEmittedSize() in parallel during the split-functions
634 // pass. If an inconsistent state is found (symbol already registered or
635 // already defined), clean it.
636 Error CleanMCState::runOnFunctions(BinaryContext &BC) {
637   MCContext &Ctx = *BC.Ctx;
638   for (const auto &SymMapEntry : Ctx.getSymbols()) {
639     const MCSymbol *S = SymMapEntry.getValue().Symbol;
640     if (!S)
641       continue;
642     if (S->isDefined()) {
643       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
644                         << "\" is already defined\n");
645       const_cast<MCSymbol *>(S)->setUndefined();
646     }
647     if (S->isRegistered()) {
648       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
649                         << "\" is already registered\n");
650       const_cast<MCSymbol *>(S)->setIsRegistered(false);
651     }
652     LLVM_DEBUG(if (S->isVariable()) {
653       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
654     });
655   }
656   return Error::success();
657 }
658 
659 // This peephole fixes jump instructions that jump to another basic
660 // block with a single jump instruction, e.g.
661 //
662 // B0: ...
663 //     jmp  B1   (or jcc B1)
664 //
665 // B1: jmp  B2
666 //
667 // ->
668 //
669 // B0: ...
670 //     jmp  B2   (or jcc B2)
671 //
672 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
673   uint64_t NumDoubleJumps = 0;
674 
675   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
676   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
677   for (BinaryBasicBlock &BB : Function) {
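    // checkAndPatch redirects Pred's branch targeting BB to Succ or, when Succ
    // is null (the tail-call case), converts the terminating jump into a tail
    // call to SuccSym. Returns true if the predecessor was patched.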
678     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
679                              const MCSymbol *SuccSym,
680                              std::optional<uint32_t> Offset) {
681       // Ignore infinite loop jumps or fallthrough tail jumps.
682       if (Pred == Succ || Succ == &BB)
683         return false;
684 
685       if (Succ) {
686         const MCSymbol *TBB = nullptr;
687         const MCSymbol *FBB = nullptr;
688         MCInst *CondBranch = nullptr;
689         MCInst *UncondBranch = nullptr;
690         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
691         if (!Res) {
692           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
693                      Pred->dump());
694           return false;
695         }
696         Pred->replaceSuccessor(&BB, Succ);
697 
698         // We must patch up any existing branch instructions to match up
699         // with the new successor.
700         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
701                "Predecessor block has inconsistent number of successors");
702         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
703           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
704         } else if (UncondBranch &&
705                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
706           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
707         } else if (!UncondBranch) {
708           assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
709                  "Don't add an explicit jump to a fallthrough block.");
710           Pred->addBranchInstruction(Succ);
711         }
712       } else {
713         // Succ will be null in the tail call case.  In this case we
714         // need to explicitly add a tail call instruction.
715         MCInst *Branch = Pred->getLastNonPseudoInstr();
716         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
717           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
718           Pred->removeSuccessor(&BB);
719           Pred->eraseInstruction(Pred->findInstruction(Branch));
720           Pred->addTailCallInstruction(SuccSym);
721           if (Offset) {
722             MCInst *TailCall = Pred->getLastNonPseudoInstr();
723             assert(TailCall);
724             MIB->setOffset(*TailCall, *Offset);
725           }
726         } else {
727           return false;
728         }
729       }
730 
731       ++NumDoubleJumps;
732       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
733                         << Pred->getName() << " -> " << BB.getName() << " to "
734                         << Pred->getName() << " -> " << SuccSym->getName()
735                         << (!Succ ? " (tail)\n" : "\n"));
736 
737       return true;
738     };
739 
740     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
741       continue;
742 
743     MCInst *Inst = BB.getFirstNonPseudoInstr();
744     const bool IsTailCall = MIB->isTailCall(*Inst);
745 
746     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
747       continue;
748 
749     // If we operate after SCTC make sure it's not a conditional tail call.
750     if (IsTailCall && MIB->isConditionalBranch(*Inst))
751       continue;
752 
753     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
754     BinaryBasicBlock *Succ = BB.getSuccessor();
755 
756     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
757       continue;
758 
759     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
760 
761     for (BinaryBasicBlock *Pred : Preds) {
762       if (Pred->isLandingPad())
763         continue;
764 
765       if (Pred->getSuccessor() == &BB ||
766           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
767           Pred->getConditionalSuccessor(false) == &BB)
768         if (checkAndPatch(Pred, Succ, SuccSym, MIB->getOffset(*Inst)) &&
769             MarkInvalid)
770           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
771                        BB.isEntryPoint());
772     }
773   }
774 
775   return NumDoubleJumps;
776 }
777 
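// Decide whether the conditional branch in PredBB may be rewritten into a
// conditional tail call, based on the -sctc-mode setting and, in heuristic
// mode, on the recorded branch direction and taken frequency.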
778 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
779     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
780     const BinaryBasicBlock *BB, const bool DirectionFlag) {
781   if (BeenOptimized.count(PredBB))
782     return false;
783 
784   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
785 
786   if (IsForward)
787     ++NumOrigForwardBranches;
788   else
789     ++NumOrigBackwardBranches;
790 
791   if (opts::SctcMode == opts::SctcAlways)
792     return true;
793 
794   if (opts::SctcMode == opts::SctcPreserveDirection)
795     return IsForward == DirectionFlag;
796 
797   const ErrorOr<std::pair<double, double>> Frequency =
798       PredBB->getBranchStats(BB);
799 
800   // It's ok to rewrite the conditional branch if the new target will be
801   // a backward branch.
802 
803   // If no data available for these branches, then it should be ok to
804   // do the optimization since it will reduce code size.
805   if (Frequency.getError())
806     return true;
807 
808   // TODO: should this use misprediction frequency instead?
809   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
810                       (!IsForward && Frequency.get().first <= 0.5);
811 
812   return Result == DirectionFlag;
813 }
814 
815 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
816   // Need updated indices to correctly detect branch direction.
817   BF.getLayout().updateLayoutIndices();
818   BF.markUnreachableBlocks();
819 
820   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
821   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
822   uint64_t NumLocalCTCCandidates = 0;
823   uint64_t NumLocalCTCs = 0;
824   uint64_t LocalCTCTakenCount = 0;
825   uint64_t LocalCTCExecCount = 0;
826   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
827       NeedsUncondBranch;
828 
829   // Will block be deleted by UCE?
830   auto isValid = [](const BinaryBasicBlock *BB) {
831     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
832   };
833 
834   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
835     // Locate BB with a single direct tail-call instruction.
836     if (BB->getNumNonPseudos() != 1)
837       continue;
838 
839     MCInst *Instr = BB->getFirstNonPseudoInstr();
840     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
841       continue;
842 
843     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
844     if (!CalleeSymbol)
845       continue;
846 
847     // Detect direction of the possible conditional tail call.
848     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
849 
850     // Iterate through all predecessors.
851     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
852       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
853       if (!CondSucc)
854         continue;
855 
856       ++NumLocalCTCCandidates;
857 
858       const MCSymbol *TBB = nullptr;
859       const MCSymbol *FBB = nullptr;
860       MCInst *CondBranch = nullptr;
861       MCInst *UncondBranch = nullptr;
862       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
863 
864       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
865       if (!Result) {
866         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
867                    PredBB->dump());
868         continue;
869       }
870 
871       assert(Result && "internal error analyzing conditional branch");
872       assert(CondBranch && "conditional branch expected");
873 
874       // Skip dynamic branches for now.
875       if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
876         continue;
877 
878       // It's possible that PredBB is also a successor to BB that may have
879       // been processed by a previous iteration of the SCTC loop, in which
880       // case it may have been marked invalid.  We should skip rewriting in
881       // this case.
882       if (!PredBB->isValid()) {
883         assert(PredBB->isSuccessor(BB) &&
884                "PredBB should be valid if it is not a successor to BB");
885         continue;
886       }
887 
888       // We don't want to reverse the direction of the branch in the new
889       // order without further profile analysis.
890       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
891       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
892         continue;
893 
894       // Record this block so that we don't try to optimize it twice.
895       BeenOptimized.insert(PredBB);
896 
897       uint64_t Count = 0;
898       if (CondSucc != BB) {
899         // Patch the new target address into the conditional branch.
900         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
901         // Since we reversed the condition on the branch we need to change
902         // the target for the unconditional branch or add an unconditional
903         // branch to the old target. This has to be done manually since
904         // fixBranches() is not called after SCTC.
905         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
906         Count = PredBB->getFallthroughBranchInfo().Count;
907       } else {
908         // Change destination of the conditional branch.
909         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
910         Count = PredBB->getTakenBranchInfo().Count;
911       }
912       const uint64_t CTCTakenFreq =
913           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
914 
915       // Annotate it so that "isCall" returns true for this jcc.
916       MIB->setConditionalTailCall(*CondBranch);
917       // Add info about the conditional tail call frequency, otherwise this
918       // info will be lost when we delete the associated BranchInfo entry
919       auto &CTCAnnotation =
920           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
921       CTCAnnotation = CTCTakenFreq;
922       // Preserve Offset annotation, used in BAT.
923       // Instr is a direct tail call instruction that was created when CTCs are
924       // first expanded, and has the original CTC offset set.
925       if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
926         MIB->setOffset(*CondBranch, *Offset);
927 
928       // Remove the unused successor which may be eliminated later
929       // if there are no other users.
930       PredBB->removeSuccessor(BB);
931       // Update BB execution count
932       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
933         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
934       else if (CTCTakenFreq > BB->getKnownExecutionCount())
935         BB->setExecutionCount(0);
936 
937       ++NumLocalCTCs;
938       LocalCTCTakenCount += CTCTakenFreq;
939       LocalCTCExecCount += PredBB->getKnownExecutionCount();
940     }
941 
942     // Remove the block from CFG if all predecessors were removed.
943     BB->markValid(isValid(BB));
944   }
945 
946   // Add unconditional branches at the end of BBs to new successors
947   // as long as the successor is not a fallthrough.
948   for (auto &Entry : NeedsUncondBranch) {
949     BinaryBasicBlock *PredBB = Entry.first;
950     const BinaryBasicBlock *CondSucc = Entry.second;
951 
952     const MCSymbol *TBB = nullptr;
953     const MCSymbol *FBB = nullptr;
954     MCInst *CondBranch = nullptr;
955     MCInst *UncondBranch = nullptr;
956     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
957 
958     // Find the next valid block.  Invalid blocks will be deleted
959     // so they shouldn't be considered fallthrough targets.
960     const BinaryBasicBlock *NextBlock =
961         BF.getLayout().getBasicBlockAfter(PredBB, false);
962     while (NextBlock && !isValid(NextBlock))
963       NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
964 
965     // Get the unconditional successor to this block.
966     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
967     assert(PredSucc && "The other branch should be a tail call");
968 
969     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
970 
971     if (UncondBranch) {
972       if (HasFallthrough)
973         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
974       else
975         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
976     } else if (!HasFallthrough) {
977       MCInst Branch;
978       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
979       PredBB->addInstruction(Branch);
980     }
981   }
982 
983   if (NumLocalCTCs > 0) {
984     NumDoubleJumps += fixDoubleJumps(BF, true);
985     // Clean-up unreachable tail-call blocks.
986     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
987     DeletedBlocks += Stats.first;
988     DeletedBytes += Stats.second;
989 
990     assert(BF.validateCFG());
991   }
992 
993   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
994                     << " conditional tail calls from a total of "
995                     << NumLocalCTCCandidates << " candidates in function " << BF
996                     << ". CTCs execution count for this function is "
997                     << LocalCTCExecCount << " and CTC taken count is "
998                     << LocalCTCTakenCount << "\n";);
999 
1000   NumTailCallsPatched += NumLocalCTCs;
1001   NumCandidateTailCalls += NumLocalCTCCandidates;
1002   CTCExecCount += LocalCTCExecCount;
1003   CTCTakenCount += LocalCTCTakenCount;
1004 
1005   return NumLocalCTCs > 0;
1006 }
1007 
1008 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
1009   if (!BC.isX86())
1010     return Error::success();
1011 
1012   for (auto &It : BC.getBinaryFunctions()) {
1013     BinaryFunction &Function = It.second;
1014 
1015     if (!shouldOptimize(Function))
1016       continue;
1017 
1018     if (fixTailCalls(Function)) {
1019       Modified.insert(&Function);
1020       Function.setHasCanonicalCFG(false);
1021     }
1022   }
1023 
1024   if (NumTailCallsPatched)
1025     BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1026               << " tail calls (" << NumOrigForwardBranches << " forward, "
1027               << NumOrigBackwardBranches << " backward)"
1028               << " from a total of " << NumCandidateTailCalls << " candidates"
1029               << " while removing " << NumDoubleJumps << " double jumps"
1030               << " and " << DeletedBlocks << " basic blocks"
1031               << " totalling " << DeletedBytes
1032               << " bytes of code. CTCs total execution count is "
1033               << CTCExecCount << " and the number of times CTCs are taken is "
1034               << CTCTakenCount << "\n";
1035   return Error::success();
1036 }
1037 
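// Rewrite instructions that have an equivalent shorter encoding (x86 only) and
// return the number of instructions shortened. Instructions that carry an
// explicit Size annotation are left untouched.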
1038 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1039   uint64_t Count = 0;
1040   const BinaryContext &BC = Function.getBinaryContext();
1041   for (BinaryBasicBlock &BB : Function) {
1042     for (MCInst &Inst : BB) {
1043       // Skip shortening instructions with Size annotation.
1044       if (BC.MIB->getSize(Inst))
1045         continue;
1046 
1047       MCInst OriginalInst;
1048       if (opts::Verbosity > 2)
1049         OriginalInst = Inst;
1050 
1051       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1052         continue;
1053 
1054       if (opts::Verbosity > 2) {
1055         auto L = BC.scopeLock();
1056         BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1057         BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
1058         BC.outs() << "BOLT-INFO: to:";
1059         BC.printInstruction(BC.outs(), Inst, 0, &Function);
1060       }
1061 
1062       ++Count;
1063     }
1064   }
1065 
1066   return Count;
1067 }
1068 
1069 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1070   std::atomic<uint64_t> NumShortened{0};
1071   if (!BC.isX86())
1072     return Error::success();
1073 
1074   ParallelUtilities::runOnEachFunction(
1075       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1076       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1077       nullptr, "ShortenInstructions");
1078 
1079   if (NumShortened)
1080     BC.outs() << "BOLT-INFO: " << NumShortened
1081               << " instructions were shortened\n";
1082   return Error::success();
1083 }
1084 
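// Append a trap instruction after every indirect tail call in the function;
// the trap is not reachable when the tail call behaves as expected.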
1085 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1086   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1087   for (BinaryBasicBlock &BB : Function) {
1088     MCInst *Inst = BB.getLastNonPseudoInstr();
1089     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1090       MCInst Trap;
1091       MIB->createTrap(Trap);
1092       BB.addInstruction(Trap);
1093       ++TailCallTraps;
1094     }
1095   }
1096 }
1097 
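// Remove conditional branches whose taken and fall-through successors are the
// same basic block; the duplicate edge is merged via
// removeDuplicateConditionalSuccessor().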
1098 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1099   for (BinaryBasicBlock &BB : Function) {
1100     if (BB.succ_size() != 2)
1101       continue;
1102 
1103     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1104     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1105     if (CondBB != UncondBB)
1106       continue;
1107 
1108     const MCSymbol *TBB = nullptr;
1109     const MCSymbol *FBB = nullptr;
1110     MCInst *CondBranch = nullptr;
1111     MCInst *UncondBranch = nullptr;
1112     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1113 
1114     // analyzeBranch() can fail due to unusual branch instructions,
1115     // e.g. jrcxz, or jump tables (indirect jump).
1116     if (!Result || !CondBranch)
1117       continue;
1118 
1119     BB.removeDuplicateConditionalSuccessor(CondBranch);
1120     ++NumUselessCondBranches;
1121   }
1122 }
1123 
1124 Error Peepholes::runOnFunctions(BinaryContext &BC) {
1125   const char Opts =
1126       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1127                       [](const char A, const PeepholeOpts B) { return A | B; });
1128   if (Opts == PEEP_NONE)
1129     return Error::success();
1130 
1131   for (auto &It : BC.getBinaryFunctions()) {
1132     BinaryFunction &Function = It.second;
1133     if (shouldOptimize(Function)) {
1134       if (Opts & PEEP_DOUBLE_JUMPS)
1135         NumDoubleJumps += fixDoubleJumps(Function, false);
1136       if (Opts & PEEP_TAILCALL_TRAPS)
1137         addTailcallTraps(Function);
1138       if (Opts & PEEP_USELESS_BRANCHES)
1139         removeUselessCondBranches(Function);
1140       assert(Function.validateCFG());
1141     }
1142   }
1143   BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1144             << " double jumps patched.\n"
1145             << "BOLT-INFO: Peephole: " << TailCallTraps
1146             << " tail call traps inserted.\n"
1147             << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1148             << " useless conditional branches removed.\n";
1149   return Error::success();
1150 }
1151 
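// Replace loads from read-only data sections with immediate operands when the
// target address can be computed statically and is not covered by a (dynamic)
// relocation. Returns true if at least one load was simplified.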
1152 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1153   BinaryContext &BC = BF.getBinaryContext();
1154   MCPlusBuilder *MIB = BC.MIB.get();
1155 
1156   uint64_t NumLocalLoadsSimplified = 0;
1157   uint64_t NumDynamicLocalLoadsSimplified = 0;
1158   uint64_t NumLocalLoadsFound = 0;
1159   uint64_t NumDynamicLocalLoadsFound = 0;
1160 
1161   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
1162     for (MCInst &Inst : *BB) {
1163       unsigned Opcode = Inst.getOpcode();
1164       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1165 
1166       // Skip instructions that do not load from memory.
1167       if (!Desc.mayLoad())
1168         continue;
1169 
1170       // Try to statically evaluate the target memory address.
1171       uint64_t TargetAddress;
1172 
1173       if (MIB->hasPCRelOperand(Inst)) {
1174         // Try to find the symbol that corresponds to the PC-relative operand.
1175         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1176         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1177         assert(DispOpI->isExpr() &&
1178                "found PC-relative with non-symbolic displacement");
1179 
1180         // Get displacement symbol.
1181         const MCSymbol *DisplSymbol;
1182         uint64_t DisplOffset;
1183 
1184         std::tie(DisplSymbol, DisplOffset) =
1185             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1186 
1187         if (!DisplSymbol)
1188           continue;
1189 
1190         // Look up the symbol address in the global symbols map of the binary
1191         // context object.
1192         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1193         if (!BD)
1194           continue;
1195         TargetAddress = BD->getAddress() + DisplOffset;
1196       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1197         continue;
1198       }
1199 
1200       // Get the contents of the section containing the target address of the
1201       // memory operand. We are only interested in read-only sections.
1202       ErrorOr<BinarySection &> DataSection =
1203           BC.getSectionForAddress(TargetAddress);
1204       if (!DataSection || DataSection->isWritable())
1205         continue;
1206 
1207       if (BC.getRelocationAt(TargetAddress) ||
1208           BC.getDynamicRelocationAt(TargetAddress))
1209         continue;
1210 
1211       uint32_t Offset = TargetAddress - DataSection->getAddress();
1212       StringRef ConstantData = DataSection->getContents();
1213 
1214       ++NumLocalLoadsFound;
1215       if (BB->hasProfile())
1216         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1217 
1218       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1219         ++NumLocalLoadsSimplified;
1220         if (BB->hasProfile())
1221           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1222       }
1223     }
1224   }
1225 
1226   NumLoadsFound += NumLocalLoadsFound;
1227   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1228   NumLoadsSimplified += NumLocalLoadsSimplified;
1229   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1230 
1231   return NumLocalLoadsSimplified > 0;
1232 }
1233 
1234 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1235   for (auto &It : BC.getBinaryFunctions()) {
1236     BinaryFunction &Function = It.second;
1237     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1238       Modified.insert(&Function);
1239   }
1240 
1241   BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1242             << NumLoadsFound << " loads from a statically computed address.\n"
1243             << "BOLT-INFO: dynamic loads simplified: "
1244             << NumDynamicLoadsSimplified << "\n"
1245             << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1246             << "\n";
1247   return Error::success();
1248 }
1249 
1250 Error AssignSections::runOnFunctions(BinaryContext &BC) {
1251   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1252     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1253     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1254   }
1255 
1256   // In non-relocation mode functions have pre-assigned section names.
1257   if (!BC.HasRelocations)
1258     return Error::success();
1259 
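  // A separate cold code section is used only when profile data is available
  // or the user supplied an explicit function order; otherwise all functions
  // stay in the main code section.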
1260   const bool UseColdSection =
1261       BC.NumProfiledFuncs > 0 ||
1262       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1263   for (auto &BFI : BC.getBinaryFunctions()) {
1264     BinaryFunction &Function = BFI.second;
1265     if (opts::isHotTextMover(Function)) {
1266       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1267       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1268       continue;
1269     }
1270 
1271     if (!UseColdSection || Function.hasValidIndex())
1272       Function.setCodeSectionName(BC.getMainCodeSectionName());
1273     else
1274       Function.setCodeSectionName(BC.getColdCodeSectionName());
1275 
1276     if (Function.isSplit())
1277       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1278   }
1279   return Error::success();
1280 }
1281 
1282 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1283   double FlowImbalanceMean = 0.0;
1284   size_t NumBlocksConsidered = 0;
1285   double WorstBias = 0.0;
1286   const BinaryFunction *WorstBiasFunc = nullptr;
1287 
1288   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1289   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1290   // of the frequency of outgoing edges.
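  // The per-block bias is |outgoing - incoming| / incoming; the pass reports
  // the mean of this ratio over all considered blocks together with its
  // standard deviation.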
1291   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1292   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1293   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1294 
1295   // Compute mean
1296   for (const auto &BFI : BC.getBinaryFunctions()) {
1297     const BinaryFunction &Function = BFI.second;
1298     if (Function.empty() || !Function.isSimple())
1299       continue;
1300     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1301     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1302     for (const BinaryBasicBlock &BB : Function) {
1303       uint64_t TotalOutgoing = 0ULL;
1304       auto SuccBIIter = BB.branch_info_begin();
1305       for (BinaryBasicBlock *Succ : BB.successors()) {
1306         uint64_t Count = SuccBIIter->Count;
1307         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1308           ++SuccBIIter;
1309           continue;
1310         }
1311         TotalOutgoing += Count;
1312         IncomingMap[Succ] += Count;
1313         ++SuccBIIter;
1314       }
1315       OutgoingMap[&BB] = TotalOutgoing;
1316     }
1317 
1318     size_t NumBlocks = 0;
1319     double Mean = 0.0;
1320     for (const BinaryBasicBlock &BB : Function) {
1321       // Do not compute the score for low-frequency, entry, or exit blocks.
1322       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1323         continue;
1324       ++NumBlocks;
1325       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1326       Mean += fabs(Difference / IncomingMap[&BB]);
1327     }
1328 
1329     FlowImbalanceMean += Mean;
1330     NumBlocksConsidered += NumBlocks;
1331     if (!NumBlocks)
1332       continue;
1333     double FuncMean = Mean / NumBlocks;
1334     if (FuncMean > WorstBias) {
1335       WorstBias = FuncMean;
1336       WorstBiasFunc = &Function;
1337     }
1338   }
1339   if (NumBlocksConsidered > 0)
1340     FlowImbalanceMean /= NumBlocksConsidered;
1341 
1342   // Compute standard deviation
1343   NumBlocksConsidered = 0;
1344   double FlowImbalanceVar = 0.0;
1345   for (const auto &BFI : BC.getBinaryFunctions()) {
1346     const BinaryFunction &Function = BFI.second;
1347     if (Function.empty() || !Function.isSimple())
1348       continue;
1349     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1350     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1351     for (const BinaryBasicBlock &BB : Function) {
1352       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1353         continue;
1354       ++NumBlocksConsidered;
1355       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1356       FlowImbalanceVar +=
1357           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1358     }
1359   }
1360   if (NumBlocksConsidered) {
1361     FlowImbalanceVar /= NumBlocksConsidered;
1362     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1363   }
1364 
1365   // Report to user
1366   BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1367                       (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1368   if (WorstBiasFunc && opts::Verbosity >= 1) {
1369     BC.outs() << "Worst average bias observed in "
1370               << WorstBiasFunc->getPrintName() << "\n";
1371     LLVM_DEBUG(WorstBiasFunc->dump());
1372   }
1373   return Error::success();
1374 }
1375 
1376 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1377   uint64_t NumRegularFunctions = 0;
1378   uint64_t NumStaleProfileFunctions = 0;
1379   uint64_t NumAllStaleFunctions = 0;
1380   uint64_t NumInferredFunctions = 0;
1381   uint64_t NumNonSimpleProfiledFunctions = 0;
1382   uint64_t NumUnknownControlFlowFunctions = 0;
1383   uint64_t TotalSampleCount = 0;
1384   uint64_t StaleSampleCount = 0;
1385   uint64_t InferredSampleCount = 0;
1386   std::vector<const BinaryFunction *> ProfiledFunctions;
1387   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1388   for (auto &BFI : BC.getBinaryFunctions()) {
1389     const BinaryFunction &Function = BFI.second;
1390 
1391     // Ignore PLT functions for stats.
1392     if (Function.isPLTFunction())
1393       continue;
1394 
1395     // Adjustment for BAT mode: the profile for BOLT split fragments is combined
1396     // so only count the hot fragment.
1397     const uint64_t Address = Function.getAddress();
1398     bool IsHotParentOfBOLTSplitFunction = !Function.getFragments().empty() &&
1399                                           BAT && BAT->isBATFunction(Address) &&
1400                                           !BAT->fetchParentAddress(Address);
1401 
1402     ++NumRegularFunctions;
1403 
1404     // In BOLTed binaries split functions are non-simple (due to non-relocation
1405     // mode), but the original function is known to be simple and we have a
1406     // valid profile for it.
1407     if (!Function.isSimple() && !IsHotParentOfBOLTSplitFunction) {
1408       if (Function.hasProfile())
1409         ++NumNonSimpleProfiledFunctions;
1410       continue;
1411     }
1412 
1413     if (Function.hasUnknownControlFlow()) {
1414       if (opts::PrintUnknownCFG)
1415         Function.dump();
1416       else if (opts::PrintUnknown)
1417         BC.errs() << "function with unknown control flow: " << Function << '\n';
1418 
1419       ++NumUnknownControlFlowFunctions;
1420     }
1421 
1422     if (!Function.hasProfile())
1423       continue;
1424 
1425     uint64_t SampleCount = Function.getRawBranchCount();
1426     TotalSampleCount += SampleCount;
1427 
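         // Functions with a usable profile that nevertheless had to be
         // inferred are counted as stale alongside functions whose profile
         // could not be applied at all.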
1428     if (Function.hasValidProfile()) {
1429       ProfiledFunctions.push_back(&Function);
1430       if (Function.hasInferredProfile()) {
1431         ++NumInferredFunctions;
1432         InferredSampleCount += SampleCount;
1433         ++NumAllStaleFunctions;
1434       }
1435     } else {
1436       if (opts::ReportStaleFuncs) {
1437         BC.outs() << StaleFuncsHeader;
1438         StaleFuncsHeader = "";
1439         BC.outs() << "  " << Function << '\n';
1440       }
1441       ++NumStaleProfileFunctions;
1442       StaleSampleCount += SampleCount;
1443       ++NumAllStaleFunctions;
1444     }
1445   }
1446   BC.NumProfiledFuncs = ProfiledFunctions.size();
1447   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1448 
1449   const size_t NumAllProfiledFunctions =
1450       ProfiledFunctions.size() + NumStaleProfileFunctions;
1451   BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1452             << NumRegularFunctions << " functions in the binary ("
1453             << format("%.1f", NumAllProfiledFunctions /
1454                                   (float)NumRegularFunctions * 100.0f)
1455             << "%) have non-empty execution profile\n";
1456   if (NumNonSimpleProfiledFunctions) {
1457     BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1458               << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1459               << " with profile could not be optimized\n";
1460   }
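       // Stale-profile summary: PctStale is the share of profiled functions
       // whose profile was stale on input; the "equal block count" and "equal
       // icount" figures below indicate how closely the stale profiles still
       // match the functions in the binary.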
1461   if (NumAllStaleFunctions) {
1462     const float PctStale =
1463         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
1464     const float PctStaleFuncsWithEqualBlockCount =
1465         (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
1466         NumAllStaleFunctions * 100.0f;
1467     const float PctStaleBlocksWithEqualIcount =
1468         (float)BC.Stats.NumStaleBlocksWithEqualIcount /
1469         BC.Stats.NumStaleBlocks * 100.0f;
1470     auto printErrorOrWarning = [&]() {
1471       if (PctStale > opts::StaleThreshold)
1472         BC.errs() << "BOLT-ERROR: ";
1473       else
1474         BC.errs() << "BOLT-WARNING: ";
1475     };
1476     printErrorOrWarning();
1477     BC.errs() << NumAllStaleFunctions
1478               << format(" (%.1f%% of all profiled)", PctStale) << " function"
1479               << (NumAllStaleFunctions == 1 ? "" : "s")
1480               << " have invalid (possibly stale) profile."
1481                  " Use -report-stale to see the list.\n";
1482     if (TotalSampleCount > 0) {
1483       printErrorOrWarning();
1484       BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
1485                 << TotalSampleCount << " samples in the binary ("
1486                 << format("%.1f",
1487                           ((100.0f * (StaleSampleCount + InferredSampleCount)) /
1488                            TotalSampleCount))
1489                 << "%) belong to functions with invalid"
1490                    " (possibly stale) profile.\n";
1491     }
1492     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
1493               << " stale function"
1494               << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
1495               << format(" (%.1f%% of all stale)",
1496                         PctStaleFuncsWithEqualBlockCount)
1497               << " have matching block count.\n";
1498     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
1499               << " stale block"
1500               << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
1501               << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
1502               << " have matching icount.\n";
1503     if (PctStale > opts::StaleThreshold) {
1504       return createFatalBOLTError(
1505           Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
1506           Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
1507     }
1508   }
1509   if (NumInferredFunctions) {
1510     BC.outs() << format(
1511         "BOLT-INFO: inferred profile for %zu (%.2f%% of profiled, "
1512         "%.2f%% of stale) functions responsible for %.2f%% samples"
1513         " (%zu out of %zu)\n",
1514         NumInferredFunctions,
1515         100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1516         100.0 * NumInferredFunctions / NumAllStaleFunctions,
1517         100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
1518         TotalSampleCount);
1519     BC.outs() << format(
1520         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1521         " (%zu out of %zu stale) responsible for %.2f%% samples"
1522         " (%zu out of %zu stale)\n",
1523         100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1524         BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1525         100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1526         BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1527   }
1528 
1529   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1530     BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1531               << " objects was ignored\n";
1532   }
1533 
1534   if (ProfiledFunctions.size() > 10) {
1535     if (opts::Verbosity >= 1) {
1536       BC.outs() << "BOLT-INFO: top called functions are:\n";
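           // Sort profiled functions by execution count, highest first.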
1537       llvm::sort(ProfiledFunctions,
1538                  [](const BinaryFunction *A, const BinaryFunction *B) {
1539                    return B->getExecutionCount() < A->getExecutionCount();
1540                  });
1541       auto SFI = ProfiledFunctions.begin();
1542       auto SFIend = ProfiledFunctions.end();
1543       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1544            ++SFI, ++I)
1545         BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
1546                   << '\n';
1547     }
1548   }
1549 
1550   if (!opts::PrintSortedBy.empty()) {
1551     std::vector<BinaryFunction *> Functions;
1552     std::map<const BinaryFunction *, DynoStats> Stats;
1553 
1554     for (auto &BFI : BC.getBinaryFunctions()) {
1555       BinaryFunction &BF = BFI.second;
1556       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1557         Functions.push_back(&BF);
1558         Stats.emplace(&BF, getDynoStats(BF));
1559       }
1560     }
1561 
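         // LAST_DYNO_STAT serves as a sentinel meaning "sort by all dyno
         // stats" rather than by the individually requested categories.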
1562     const bool SortAll =
1563         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1564 
1565     const bool Ascending =
1566         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1567 
1568     std::function<bool(const DynoStats &, const DynoStats &)>
1569         DynoStatsComparator =
1570             SortAll ? [](const DynoStats &StatsA,
1571                          const DynoStats &StatsB) { return StatsA < StatsB; }
1572                     : [](const DynoStats &StatsA, const DynoStats &StatsB) {
1573                         return StatsA.lessThan(StatsB, opts::PrintSortedBy);
1574                       };
1575 
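         // A stable sort keeps the original order of functions with equal
         // stats; descending order is obtained by swapping the comparator's
         // arguments.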
1576     llvm::stable_sort(Functions,
1577                       [Ascending, &Stats, DynoStatsComparator](
1578                           const BinaryFunction *A, const BinaryFunction *B) {
1579                         auto StatsItr = Stats.find(A);
1580                         assert(StatsItr != Stats.end());
1581                         const DynoStats &StatsA = StatsItr->second;
1582 
1583                         StatsItr = Stats.find(B);
1584                         assert(StatsItr != Stats.end());
1585                         const DynoStats &StatsB = StatsItr->second;
1586 
1587                         return Ascending ? DynoStatsComparator(StatsA, StatsB)
1588                                          : DynoStatsComparator(StatsB, StatsA);
1589                       });
1590 
1591     BC.outs() << "BOLT-INFO: top functions sorted by ";
1592     if (SortAll) {
1593       BC.outs() << "dyno stats";
1594     } else {
1595       BC.outs() << "(";
1596       bool PrintComma = false;
1597       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1598         if (PrintComma)
1599           BC.outs() << ", ";
1600         BC.outs() << DynoStats::Description(Category);
1601         PrintComma = true;
1602       }
1603       BC.outs() << ")";
1604     }
1605 
1606     BC.outs() << " are:\n";
1607     auto SFI = Functions.begin();
1608     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1609       const DynoStats Stats = getDynoStats(**SFI);
1610       BC.outs() << "  " << **SFI;
1611       if (!SortAll) {
1612         BC.outs() << " (";
1613         bool PrintComma = false;
1614         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1615           if (PrintComma)
1616             BC.outs() << ", ";
1617           BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1618           PrintComma = true;
1619         }
1620         BC.outs() << ")";
1621       }
1622       BC.outs() << "\n";
1623     }
1624   }
1625 
1626   if (!BC.TrappedFunctions.empty()) {
1627     BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1628               << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1629               << " will trap on entry. Use -trap-avx512=0 to disable"
1630                  " traps.";
1631     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1632       BC.errs() << '\n';
1633       for (const BinaryFunction *Function : BC.TrappedFunctions)
1634         BC.errs() << "  " << *Function << '\n';
1635     } else {
1636       BC.errs() << " Use -v=1 to see the list.\n";
1637     }
1638   }
1639 
1640   // Print information on missed macro-fusion opportunities seen on input.
1641   if (BC.Stats.MissedMacroFusionPairs) {
1642     BC.outs() << format(
1643         "BOLT-INFO: the input contains %zu (dynamic count: %zu)"
1644         " opportunities for macro-fusion optimization",
1645         BC.Stats.MissedMacroFusionPairs, BC.Stats.MissedMacroFusionExecCount);
1646     switch (opts::AlignMacroOpFusion) {
1647     case MFT_NONE:
1648       BC.outs() << ". Use -align-macro-fusion to fix.\n";
1649       break;
1650     case MFT_HOT:
1651       BC.outs() << ". Will fix instances on a hot path.\n";
1652       break;
1653     case MFT_ALL:
1654       BC.outs() << " that are going to be fixed\n";
1655       break;
1656     }
1657   }
1658 
1659   // Collect and print information about suboptimal code layout on input.
1660   if (opts::ReportBadLayout) {
1661     std::vector<BinaryFunction *> SuboptimalFuncs;
1662     for (auto &BFI : BC.getBinaryFunctions()) {
1663       BinaryFunction &BF = BFI.second;
1664       if (!BF.hasValidProfile())
1665         continue;
1666 
1667       const uint64_t HotThreshold =
1668           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1669       bool HotSeen = false;
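           // Walk the layout in reverse: after seeing a block executed more
           // often than the function is entered, any zero-count block that
           // follows (i.e. precedes it in the layout) indicates cold code
           // placed before or between hot code.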
1670       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1671         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1672           HotSeen = true;
1673           continue;
1674         }
1675         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1676           SuboptimalFuncs.push_back(&BF);
1677           break;
1678         }
1679       }
1680     }
1681 
1682     if (!SuboptimalFuncs.empty()) {
1683       llvm::sort(SuboptimalFuncs,
1684                  [](const BinaryFunction *A, const BinaryFunction *B) {
1685                    return A->getKnownExecutionCount() / A->getSize() >
1686                           B->getKnownExecutionCount() / B->getSize();
1687                  });
1688 
1689       BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1690                 << " functions have "
1691                    "cold code in the middle of hot code. Top functions are:\n";
1692       for (unsigned I = 0;
1693            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1694                         SuboptimalFuncs.size());
1695            ++I)
1696         SuboptimalFuncs[I]->print(BC.outs());
1697     }
1698   }
1699 
1700   if (NumUnknownControlFlowFunctions) {
1701     BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1702               << " functions have instructions with unknown control flow";
1703     if (!opts::PrintUnknown)
1704       BC.outs() << ". Use -print-unknown to see the list.";
1705     BC.outs() << '\n';
1706   }
1707   return Error::success();
1708 }
1709 
1710 Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1711   for (auto &BFI : BC.getBinaryFunctions())
1712     for (BinaryBasicBlock &BB : BFI.second)
1713       for (MCInst &Instruction : BB)
1714         BC.MIB->lowerTailCall(Instruction);
1715   return Error::success();
1716 }
1717 
1718 Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1719   if (!BC.isX86())
1720     return Error::success();
1721 
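       // A "repz" prefix on a near return ("rep ret") was historically emitted
       // by compilers to sidestep a branch-prediction penalty on some older
       // AMD processors; dropping the prefix saves one byte per return.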
1722   uint64_t NumPrefixesRemoved = 0;
1723   uint64_t NumBytesSaved = 0;
1724   for (auto &BFI : BC.getBinaryFunctions()) {
1725     for (BinaryBasicBlock &BB : BFI.second) {
1726       auto LastInstRIter = BB.getLastNonPseudo();
1727       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1728           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1729         continue;
1730 
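           // Each removed prefix saves one byte statically; the dynamic
           // benefit is estimated from the block's execution count.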
1731       NumPrefixesRemoved += BB.getKnownExecutionCount();
1732       ++NumBytesSaved;
1733     }
1734   }
1735 
1736   if (NumBytesSaved)
1737     BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
1738               << " 'repz' prefixes"
1739                  " with estimated execution count of "
1740               << NumPrefixesRemoved << " times.\n";
1741   return Error::success();
1742 }
1743 
1744 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1745   if (!BC.isX86())
1746     return Error::success();
1747 
1748   uint64_t NumInlined = 0;
1749   uint64_t NumInlinedDyno = 0;
1750   for (auto &BFI : BC.getBinaryFunctions()) {
1751     for (BinaryBasicBlock &BB : BFI.second) {
1752       for (auto II = BB.begin(); II != BB.end(); ++II) {
1753         MCInst &Inst = *II;
1754 
1755         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1756             !Inst.getOperand(0).isExpr())
1757           continue;
1758 
1759         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1760         if (CalleeSymbol->getName() != "memcpy" &&
1761             CalleeSymbol->getName() != "memcpy@PLT" &&
1762             CalleeSymbol->getName() != "_memcpy8")
1763           continue;
1764 
1765         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1766         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1767 
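             // Replace the call with the inlined sequence and advance the
             // iterator to the last inserted instruction.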
1768         const InstructionListType NewCode =
1769             BC.MIB->createInlineMemcpy(IsMemcpy8);
1770         II = BB.replaceInstruction(II, NewCode);
1771         std::advance(II, NewCode.size() - 1);
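             // A tail call never falls through to a return, so append an
             // explicit return after the inlined code.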
1772         if (IsTailCall) {
1773           MCInst Return;
1774           BC.MIB->createReturn(Return);
1775           II = BB.insertInstruction(std::next(II), std::move(Return));
1776         }
1777 
1778         ++NumInlined;
1779         NumInlinedDyno += BB.getKnownExecutionCount();
1780       }
1781     }
1782   }
1783 
1784   if (NumInlined) {
1785     BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1786     if (NumInlinedDyno)
1787       BC.outs() << ". The calls were executed " << NumInlinedDyno
1788                 << " times based on profile.";
1789     BC.outs() << '\n';
1790   }
1791   return Error::success();
1792 }
1793 
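     // Each entry of Spec has the form "<name-regex>[:<site>[:<site>...]]".
     // For example, a hypothetical entry "foo.*:1:3" would restrict the
     // specialization in functions matching foo.* to their first and third
     // memcpy() call sites; with no sites listed, all call sites qualify.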
1794 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1795   if (!BinaryFunctionPass::shouldOptimize(Function))
1796     return false;
1797 
1798   for (const std::string &FunctionSpec : Spec) {
1799     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1800     if (Function.hasNameRegex(FunctionName))
1801       return true;
1802   }
1803 
1804   return false;
1805 }
1806 
1807 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1808     const BinaryFunction &Function) const {
1809   StringRef SitesString;
1810   for (const std::string &FunctionSpec : Spec) {
1811     StringRef FunctionName;
1812     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1813     if (Function.hasNameRegex(FunctionName))
1814       break;
1815     SitesString = "";
1816   }
1817 
1818   std::set<size_t> Sites;
1819   SmallVector<StringRef, 4> SitesVec;
1820   SitesString.split(SitesVec, ':');
1821   for (StringRef SiteString : SitesVec) {
1822     if (SiteString.empty())
1823       continue;
1824     size_t Result;
1825     if (!SiteString.getAsInteger(10, Result))
1826       Sites.emplace(Result);
1827   }
1828 
1829   return Sites;
1830 }
1831 
1832 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1833   if (!BC.isX86())
1834     return Error::success();
1835 
1836   uint64_t NumSpecialized = 0;
1837   uint64_t NumSpecializedDyno = 0;
1838   for (auto &BFI : BC.getBinaryFunctions()) {
1839     BinaryFunction &Function = BFI.second;
1840     if (!shouldOptimize(Function))
1841       continue;
1842 
1843     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1844     auto shouldOptimize = [&](size_t N) {
1845       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1846     };
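         // An empty set means every memcpy() call site in the function is a
         // candidate; CallSiteID below numbers qualifying call sites from 1 in
         // the order they are encountered.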
1847 
1848     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1849     size_t CallSiteID = 0;
1850     for (BinaryBasicBlock *CurBB : Blocks) {
1851       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1852         MCInst &Inst = *II;
1853 
1854         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1855             !Inst.getOperand(0).isExpr())
1856           continue;
1857 
1858         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1859         if (CalleeSymbol->getName() != "memcpy" &&
1860             CalleeSymbol->getName() != "memcpy@PLT")
1861           continue;
1862 
1863         if (BC.MIB->isTailCall(Inst))
1864           continue;
1865 
1866         ++CallSiteID;
1867 
1868         if (!shouldOptimize(CallSiteID))
1869           continue;
1870 
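             // Roughly, on x86-64 (size lives in the third integer-argument
             // register, %rdx in the SysV ABI) the rewrite below produces:
             //   CurBB:           ...; cmp $0x1, %rdx; je OneByteMemcpyBB
             //   MemcpyBB:        call memcpy   (fall-through from CurBB)
             //   OneByteMemcpyBB: inlined one-byte copy
             //   NextBB:          code following the original call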
1871         // Create a copy of a call to memcpy(dest, src, size).
1872         MCInst MemcpyInstr = Inst;
1873 
1874         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1875 
1876         BinaryBasicBlock *NextBB = nullptr;
1877         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1878           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1879           NextBB->eraseInstruction(NextBB->begin());
1880         } else {
1881           NextBB = OneByteMemcpyBB->getSuccessor();
1882           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1883           assert(NextBB && "unexpected call to memcpy() with no return");
1884         }
1885 
1886         BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
1887         MemcpyBB->setOffset(CurBB->getInputOffset());
1888         InstructionListType CmpJCC =
1889             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1890                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1891         CurBB->addInstructions(CmpJCC);
1892         CurBB->addSuccessor(MemcpyBB);
1893 
1894         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1895         MemcpyBB->addSuccessor(NextBB);
1896         MemcpyBB->setCFIState(NextBB->getCFIState());
1897         MemcpyBB->setExecutionCount(0);
1898 
1899         // To prevent the actual call from being moved to the cold section,
1900         // set its execution count to 1.
1901         if (CurBB->getKnownExecutionCount() > 0)
1902           MemcpyBB->setExecutionCount(1);
1903 
1904         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1905         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1906 
1907         ++NumSpecialized;
1908         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1909 
1910         CurBB = NextBB;
1911 
1912         // Note: we don't expect the next instruction to be a call to memcpy.
1913         II = CurBB->begin();
1914       }
1915     }
1916   }
1917 
1918   if (NumSpecialized) {
1919     BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
1920               << " memcpy() call sites for size 1";
1921     if (NumSpecializedDyno)
1922       BC.outs() << ". The calls were executed " << NumSpecializedDyno
1923                 << " times based on profile.";
1924     BC.outs() << '\n';
1925   }
1926   return Error::success();
1927 }
1928 
1929 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1930   const BinaryContext &BC = BF.getBinaryContext();
1931   for (BinaryBasicBlock &BB : BF) {
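         // Iterate backwards so that erasing the instruction at index I does
         // not shift the indices of instructions yet to be visited.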
1932     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1933       MCInst &Inst = BB.getInstructionAtIndex(I);
1934       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1935         BB.eraseInstructionAtIndex(I);
1936     }
1937   }
1938 }
1939 
1940 Error RemoveNops::runOnFunctions(BinaryContext &BC) {
1941   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1942     runOnFunction(BF);
1943   };
1944 
1945   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1946     return BF.shouldPreserveNops();
1947   };
1948 
1949   ParallelUtilities::runOnEachFunction(
1950       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1951       SkipFunc, "RemoveNops");
1952   return Error::success();
1953 }
1954 
1955 } // namespace bolt
1956 } // namespace llvm
1957