xref: /llvm-project/bolt/lib/Passes/BinaryPasses.cpp (revision 3c64b24ed35fe6bb4a7a1fe304089e6069fcd14d)
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "llvm/Support/CommandLine.h"
19 #include <atomic>
20 #include <mutex>
21 #include <numeric>
22 #include <vector>
23 
24 #define DEBUG_TYPE "bolt-opts"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32 
33   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34 
35   OptNames[C] = bolt::DynoStats::Description(C);
36 
37   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
38 
39   return OptNames[C].c_str();
40 }
41 
42 namespace opts {
43 
44 extern cl::OptionCategory BoltCategory;
45 extern cl::OptionCategory BoltOptCategory;
46 
47 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
48 extern cl::opt<unsigned> Verbosity;
49 extern cl::opt<bool> EnableBAT;
50 extern cl::opt<unsigned> ExecutionCountThreshold;
51 extern cl::opt<bool> UpdateDebugSections;
52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
53 
54 enum DynoStatsSortOrder : char {
55   Ascending,
56   Descending
57 };
58 
59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
60     "print-sorted-by-order",
61     cl::desc("use ascending or descending order when printing functions "
62              "ordered by dyno stats"),
63     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
64 
65 cl::list<std::string>
66 HotTextMoveSections("hot-text-move-sections",
67   cl::desc("list of sections containing functions used for hugifying hot text. "
68            "BOLT makes sure these functions are not placed on the same page as "
69            "the hot text. (default=\'.stub,.mover\')."),
70   cl::value_desc("sec1,sec2,sec3,..."),
71   cl::CommaSeparated,
72   cl::ZeroOrMore,
73   cl::cat(BoltCategory));
74 
75 bool isHotTextMover(const BinaryFunction &Function) {
76   for (std::string &SectionName : opts::HotTextMoveSections) {
77     if (Function.getOriginSectionName() &&
78         *Function.getOriginSectionName() == SectionName)
79       return true;
80   }
81 
82   return false;
83 }
84 
85 static cl::opt<bool> MinBranchClusters(
86     "min-branch-clusters",
87     cl::desc("use a modified clustering algorithm geared towards minimizing "
88              "branches"),
89     cl::Hidden, cl::cat(BoltOptCategory));
90 
91 static cl::list<Peepholes::PeepholeOpts> Peepholes(
92     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
93     cl::value_desc("opt1,opt2,opt3,..."),
94     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
95                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
96                           "remove double jumps when able"),
97                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
98                           "insert tail call traps"),
99                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
100                           "remove useless conditional branches"),
101                clEnumValN(Peepholes::PEEP_ALL, "all",
102                           "enable all peephole optimizations")),
103     cl::ZeroOrMore, cl::cat(BoltOptCategory));
104 
105 static cl::opt<unsigned>
106     PrintFuncStat("print-function-statistics",
107                   cl::desc("print statistics about basic block ordering"),
108                   cl::init(0), cl::cat(BoltOptCategory));
109 
110 static cl::list<bolt::DynoStats::Category>
111     PrintSortedBy("print-sorted-by", cl::CommaSeparated,
112                   cl::desc("print functions sorted by order of dyno stats"),
113                   cl::value_desc("key1,key2,key3,..."),
114                   cl::values(
115 #define D(name, description, ...)                                              \
116   clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
117              description),
118                       REAL_DYNO_STATS
119 #undef D
120                           clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
121                                      "sorted by all names")),
122                   cl::ZeroOrMore, cl::cat(BoltOptCategory));
123 
124 static cl::opt<bool>
125     PrintUnknown("print-unknown",
126                  cl::desc("print names of functions with unknown control flow"),
127                  cl::cat(BoltCategory), cl::Hidden);
128 
129 static cl::opt<bool>
130     PrintUnknownCFG("print-unknown-cfg",
131                     cl::desc("dump CFG of functions with unknown control flow"),
132                     cl::cat(BoltCategory), cl::ReallyHidden);
133 
134 // Please MSVC19 with a forward declaration: otherwise it reports an error about
135 // an undeclared variable inside a callback.
136 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
137 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
138     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
139     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
140     cl::values(
141         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
142                    "do not reorder basic blocks"),
143         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
144                    "layout blocks in reverse order"),
145         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
146                    "perform optimal layout based on profile"),
147         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
148                    "branch-predictor",
149                    "perform optimal layout prioritizing branch "
150                    "predictions"),
151         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
152                    "perform optimal layout prioritizing I-cache "
153                    "behavior"),
154         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
155                    "perform layout optimizing I-cache behavior"),
156         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
157                    "perform layout optimizing I-cache behavior"),
158         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
159                    "cluster-shuffle", "perform random layout of clusters")),
160     cl::ZeroOrMore, cl::cat(BoltOptCategory),
161     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
162       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
163         errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
164                << " use '-reorder-blocks=ext-tsp' instead\n";
165         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
166       }
167     }));
168 
169 static cl::opt<unsigned> ReportBadLayout(
170     "report-bad-layout",
171     cl::desc("print top <uint> functions with suboptimal code layout on input"),
172     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
173 
174 static cl::opt<bool>
175     ReportStaleFuncs("report-stale",
176                      cl::desc("print the list of functions with stale profile"),
177                      cl::Hidden, cl::cat(BoltOptCategory));
178 
179 enum SctcModes : char {
180   SctcAlways,
181   SctcPreserveDirection,
182   SctcHeuristic
183 };
184 
185 static cl::opt<SctcModes>
186 SctcMode("sctc-mode",
187   cl::desc("mode for simplify conditional tail calls"),
188   cl::init(SctcAlways),
189   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
190     clEnumValN(SctcPreserveDirection,
191       "preserve",
192       "only perform sctc when branch direction is "
193       "preserved"),
194     clEnumValN(SctcHeuristic,
195       "heuristic",
196       "use branch prediction data to control sctc")),
197   cl::ZeroOrMore,
198   cl::cat(BoltOptCategory));
199 
200 static cl::opt<unsigned>
201 StaleThreshold("stale-threshold",
202     cl::desc(
203       "maximum percentage of stale functions to tolerate (default: 100)"),
204     cl::init(100),
205     cl::Hidden,
206     cl::cat(BoltOptCategory));
207 
208 static cl::opt<unsigned> TSPThreshold(
209     "tsp-threshold",
210     cl::desc(
211         "maximum number of hot basic blocks in a function for which to use "
212         "a precise TSP solution while re-ordering basic blocks"),
213     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
214 
215 static cl::opt<unsigned> TopCalledLimit(
216     "top-called-limit",
217     cl::desc("maximum number of functions to print in top called "
218              "functions section"),
219     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
220 
221 } // namespace opts
222 
223 namespace llvm {
224 namespace bolt {
225 
226 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
227   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
228          !BF.isIgnored();
229 }
230 
231 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
232   return BF.isSimple() && !BF.isIgnored();
233 }
234 
235 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
236   uint64_t NumRemoved = 0;
237   uint64_t NumDuplicateEdges = 0;
238   uint64_t NeedsFixBranches = 0;
239   for (BinaryBasicBlock &BB : BF) {
240     if (!BB.empty())
241       continue;
242 
243     if (BB.isEntryPoint() || BB.isLandingPad())
244       continue;
245 
246     // Handle a dangling empty block.
247     if (BB.succ_size() == 0) {
248       // If an empty dangling basic block has a predecessor, it could be a
249       // result of codegen for __builtin_unreachable. In such case, do not
250       // remove the block.
251       if (BB.pred_size() == 0) {
252         BB.markValid(false);
253         ++NumRemoved;
254       }
255       continue;
256     }
257 
258     // The block should have just one successor.
259     BinaryBasicBlock *Successor = BB.getSuccessor();
260     assert(Successor && "invalid CFG encountered");
261 
262     // Redirect all predecessors to the successor block.
263     while (!BB.pred_empty()) {
264       BinaryBasicBlock *Predecessor = *BB.pred_begin();
265       if (Predecessor->hasJumpTable())
266         break;
267 
268       if (Predecessor == Successor)
269         break;
270 
271       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
272       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
273                                     BI.MispredictedCount);
274       // We need to fix branches even if we failed to replace all successors
275       // and remove the block.
276       NeedsFixBranches = true;
277     }
278 
279     if (BB.pred_empty()) {
280       BB.removeAllSuccessors();
281       BB.markValid(false);
282       ++NumRemoved;
283     }
284   }
285 
286   if (NumRemoved)
287     BF.eraseInvalidBBs();
288 
289   // Check for duplicate successors. Do it after the empty block elimination as
290   // we can get more duplicate successors.
291   for (BinaryBasicBlock &BB : BF)
292     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
293         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
294       ++NumDuplicateEdges;
295 
296   // fixBranches() will get rid of duplicate edges and update jump instructions.
297   if (NumDuplicateEdges || NeedsFixBranches)
298     BF.fixBranches();
299 
300   NumDuplicateEdgesMerged += NumDuplicateEdges;
301   NumBlocksRemoved += NumRemoved;
302 }
303 
304 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
305   ParallelUtilities::runOnEachFunction(
306       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
307       [&](BinaryFunction &BF) { runOnFunction(BF); },
308       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
309       "NormalizeCFG");
310   if (NumBlocksRemoved)
311     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
312            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
313   if (NumDuplicateEdgesMerged)
314     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
315            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
316            << '\n';
317 }
318 
319 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
320   BinaryContext &BC = Function.getBinaryContext();
321   unsigned Count;
322   uint64_t Bytes;
323   Function.markUnreachableBlocks();
324   LLVM_DEBUG({
325     for (BinaryBasicBlock &BB : Function) {
326       if (!BB.isValid()) {
327         dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
328                << " in function " << Function << "\n";
329         Function.dump();
330       }
331     }
332   });
333   BinaryContext::IndependentCodeEmitter Emitter =
334       BC.createIndependentMCCodeEmitter();
335   std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
336   DeletedBlocks += Count;
337   DeletedBytes += Bytes;
338   if (Count) {
339     auto L = BC.scopeLock();
340     Modified.insert(&Function);
341     if (opts::Verbosity > 0)
342       outs() << "BOLT-INFO: removed " << Count
343              << " dead basic block(s) accounting for " << Bytes
344              << " bytes in function " << Function << '\n';
345   }
346 }
347 
348 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
349   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
350     runOnFunction(BF);
351   };
352 
353   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
354     return !shouldOptimize(BF) || BF.getLayout().block_empty();
355   };
356 
357   ParallelUtilities::runOnEachFunction(
358       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
359       SkipPredicate, "elimininate-unreachable");
360 
361   if (DeletedBlocks)
362     outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
363            << DeletedBytes << " bytes of code\n";
364 }
365 
366 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
367   return (BinaryFunctionPass::shouldPrint(BF) &&
368           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
369 }
370 
371 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
372   // Apply execution count threshold
373   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
374     return false;
375 
376   return BinaryFunctionPass::shouldOptimize(BF);
377 }
378 
379 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
380   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
381     return;
382 
383   std::atomic_uint64_t ModifiedFuncCount(0);
384   std::mutex FunctionEditDistanceMutex;
385   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
386 
387   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
388     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
389     if (opts::PrintFuncStat > 0)
390       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
391 
392     const bool LayoutChanged =
393         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
394     if (LayoutChanged) {
395       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
396       if (opts::PrintFuncStat > 0) {
397         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
398         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
399         FunctionEditDistance[&BF] = Distance;
400       }
401     }
402   };
403 
404   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
405     return !shouldOptimize(BF);
406   };
407 
408   ParallelUtilities::runOnEachFunction(
409       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
410       "ReorderBasicBlocks");
411   const size_t NumAllProfiledFunctions =
412       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
413 
414   outs() << "BOLT-INFO: basic block reordering modified layout of "
415          << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
416                    ModifiedFuncCount.load(std::memory_order_relaxed),
417                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
418                        NumAllProfiledFunctions,
419                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
420                        BC.getBinaryFunctions().size());
421 
422   if (opts::PrintFuncStat > 0) {
423     raw_ostream &OS = outs();
424     // Copy all the values into vector in order to sort them
425     std::map<uint64_t, BinaryFunction &> ScoreMap;
426     auto &BFs = BC.getBinaryFunctions();
427     for (auto It = BFs.begin(); It != BFs.end(); ++It)
428       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
429           It->second.getFunctionScore(), It->second));
430 
431     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
432     OS << "           There are " << BFs.size() << " functions in total. \n";
433     OS << "           Number of functions being modified: "
434        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
435     OS << "           User asks for detailed information on top "
436        << opts::PrintFuncStat << " functions. (Ranked by function score)"
437        << "\n\n";
438     uint64_t I = 0;
439     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
440              ScoreMap.rbegin();
441          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
442       BinaryFunction &Function = Rit->second;
443 
444       OS << "           Information for function of top: " << (I + 1) << ": \n";
445       OS << "             Function Score is: " << Function.getFunctionScore()
446          << "\n";
447       OS << "             There are " << Function.size()
448          << " number of blocks in this function.\n";
449       OS << "             There are " << Function.getInstructionCount()
450          << " number of instructions in this function.\n";
451       OS << "             The edit distance for this function is: "
452          << FunctionEditDistance.lookup(&Function) << "\n\n";
453     }
454   }
455 }
456 
457 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
458                                               LayoutType Type,
459                                               bool MinBranchClusters) const {
460   if (BF.size() == 0 || Type == LT_NONE)
461     return false;
462 
463   BinaryFunction::BasicBlockOrderType NewLayout;
464   std::unique_ptr<ReorderAlgorithm> Algo;
465 
466   // Cannot do optimal layout without profile.
467   if (Type != LT_REVERSE && !BF.hasValidProfile())
468     return false;
469 
470   if (Type == LT_REVERSE) {
471     Algo.reset(new ReverseReorderAlgorithm());
472   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
473     // Work on optimal solution if problem is small enough
474     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
475     Algo.reset(new TSPReorderAlgorithm());
476   } else {
477     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
478 
479     std::unique_ptr<ClusterAlgorithm> CAlgo;
480     if (MinBranchClusters)
481       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
482     else
483       CAlgo.reset(new PHGreedyClusterAlgorithm());
484 
485     switch (Type) {
486     case LT_OPTIMIZE:
487       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
488       break;
489 
490     case LT_OPTIMIZE_BRANCH:
491       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
492       break;
493 
494     case LT_OPTIMIZE_CACHE:
495       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
496       break;
497 
498     case LT_OPTIMIZE_EXT_TSP:
499       Algo.reset(new ExtTSPReorderAlgorithm());
500       break;
501 
502     case LT_OPTIMIZE_SHUFFLE:
503       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
504       break;
505 
506     default:
507       llvm_unreachable("unexpected layout type");
508     }
509   }
510 
511   Algo->reorderBasicBlocks(BF, NewLayout);
512 
513   return BF.getLayout().update(NewLayout);
514 }
515 
516 void FixupBranches::runOnFunctions(BinaryContext &BC) {
517   for (auto &It : BC.getBinaryFunctions()) {
518     BinaryFunction &Function = It.second;
519     if (!BC.shouldEmit(Function) || !Function.isSimple())
520       continue;
521 
522     Function.fixBranches();
523   }
524 }
525 
526 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
527   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
528     if (!BF.finalizeCFIState()) {
529       if (BC.HasRelocations) {
530         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
531                << ". Exiting.\n";
532         exit(1);
533       }
534       BF.setSimple(false);
535       return;
536     }
537 
538     BF.setFinalized();
539 
540     // Update exception handling information.
541     BF.updateEHRanges();
542   };
543 
544   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
545     return !BC.shouldEmit(BF);
546   };
547 
548   ParallelUtilities::runOnEachFunction(
549       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
550       SkipPredicate, "FinalizeFunctions");
551 }
552 
553 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
554   if (BC.HasRelocations)
555     return;
556 
557   if (!opts::UpdateDebugSections)
558     return;
559 
560   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
561   // incorrect debug info.
562   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
563     uint64_t HotSize, ColdSize;
564     std::tie(HotSize, ColdSize) =
565         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
566     if (HotSize > BF.getMaxSize())
567       BF.setSimple(false);
568   };
569 
570   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
571     return !shouldOptimize(BF);
572   };
573 
574   ParallelUtilities::runOnEachFunction(
575       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
576       SkipFunc, "CheckLargeFunctions");
577 }
578 
579 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
580   // Unlike other passes, allow functions in non-CFG state.
581   return BF.isSimple() && !BF.isIgnored();
582 }
583 
584 void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
585   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
586     for (FunctionFragment &FF : BF->getLayout().fragments()) {
587       // Reset at the start of the new fragment.
588       int64_t CurrentGnuArgsSize = 0;
589 
590       for (BinaryBasicBlock *const BB : FF) {
591         for (auto II = BB->begin(); II != BB->end(); ++II) {
592 
593           // Convert GnuArgsSize annotations into CFIs.
594           if (BF->usesGnuArgsSize() && BC.MIB->isInvoke(*II)) {
595             const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
596             assert(NewGnuArgsSize >= 0 &&
597                    "Expected non-negative GNU_args_size.");
598             if (NewGnuArgsSize != CurrentGnuArgsSize) {
599               auto InsertII = BF->addCFIInstruction(
600                   BB, II,
601                   MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
602               CurrentGnuArgsSize = NewGnuArgsSize;
603               II = std::next(InsertII);
604             }
605           }
606 
607           // Preserve selected annotations and strip the rest.
608           std::optional<uint32_t> Offset = BF->requiresAddressTranslation()
609                                                ? BC.MIB->getOffset(*II)
610                                                : std::nullopt;
611           std::optional<uint32_t> Size = BC.MIB->getSize(*II);
612           MCSymbol *Label = BC.MIB->getLabel(*II);
613 
614           BC.MIB->stripAnnotations(*II);
615 
616           if (Offset)
617             BC.MIB->setOffset(*II, *Offset);
618           if (Size)
619             BC.MIB->setSize(*II, *Size);
620           if (Label)
621             BC.MIB->setLabel(*II, Label);
622         }
623       }
624     }
625   }
626 
627   // Release all memory taken by annotations
628   BC.MIB->freeAnnotations();
629 }
630 
631 // Check for dirty state in MCSymbol objects that might be a consequence
632 // of running calculateEmittedSize() in parallel, during split functions
633 // pass. If an inconsistent state is found (symbol already registered or
634 // already defined), clean it.
635 void CleanMCState::runOnFunctions(BinaryContext &BC) {
636   MCContext &Ctx = *BC.Ctx;
637   for (const auto &SymMapEntry : Ctx.getSymbols()) {
638     const MCSymbol *S = SymMapEntry.second;
639     if (S->isDefined()) {
640       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
641                         << "\" is already defined\n");
642       const_cast<MCSymbol *>(S)->setUndefined();
643     }
644     if (S->isRegistered()) {
645       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
646                         << "\" is already registered\n");
647       const_cast<MCSymbol *>(S)->setIsRegistered(false);
648     }
649     LLVM_DEBUG(if (S->isVariable()) {
650       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
651     });
652   }
653 }
654 
655 // This peephole fixes jump instructions that jump to another basic
656 // block with a single jump instruction, e.g.
657 //
658 // B0: ...
659 //     jmp  B1   (or jcc B1)
660 //
661 // B1: jmp  B2
662 //
663 // ->
664 //
665 // B0: ...
666 //     jmp  B2   (or jcc B2)
667 //
668 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
669   uint64_t NumDoubleJumps = 0;
670 
671   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
672   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
673   for (BinaryBasicBlock &BB : Function) {
674     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
675                              const MCSymbol *SuccSym) {
676       // Ignore infinite loop jumps or fallthrough tail jumps.
677       if (Pred == Succ || Succ == &BB)
678         return false;
679 
680       if (Succ) {
681         const MCSymbol *TBB = nullptr;
682         const MCSymbol *FBB = nullptr;
683         MCInst *CondBranch = nullptr;
684         MCInst *UncondBranch = nullptr;
685         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
686         if (!Res) {
687           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
688                      Pred->dump());
689           return false;
690         }
691         Pred->replaceSuccessor(&BB, Succ);
692 
693         // We must patch up any existing branch instructions to match up
694         // with the new successor.
695         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
696                "Predecessor block has inconsistent number of successors");
697         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
698           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
699         } else if (UncondBranch &&
700                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
701           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
702         } else if (!UncondBranch) {
703           assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
704                  "Don't add an explicit jump to a fallthrough block.");
705           Pred->addBranchInstruction(Succ);
706         }
707       } else {
708         // Succ will be null in the tail call case.  In this case we
709         // need to explicitly add a tail call instruction.
710         MCInst *Branch = Pred->getLastNonPseudoInstr();
711         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
712           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
713           Pred->removeSuccessor(&BB);
714           Pred->eraseInstruction(Pred->findInstruction(Branch));
715           Pred->addTailCallInstruction(SuccSym);
716         } else {
717           return false;
718         }
719       }
720 
721       ++NumDoubleJumps;
722       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
723                         << Pred->getName() << " -> " << BB.getName() << " to "
724                         << Pred->getName() << " -> " << SuccSym->getName()
725                         << (!Succ ? " (tail)\n" : "\n"));
726 
727       return true;
728     };
729 
730     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
731       continue;
732 
733     MCInst *Inst = BB.getFirstNonPseudoInstr();
734     const bool IsTailCall = MIB->isTailCall(*Inst);
735 
736     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
737       continue;
738 
739     // If we operate after SCTC make sure it's not a conditional tail call.
740     if (IsTailCall && MIB->isConditionalBranch(*Inst))
741       continue;
742 
743     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
744     BinaryBasicBlock *Succ = BB.getSuccessor();
745 
746     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
747       continue;
748 
749     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
750 
751     for (BinaryBasicBlock *Pred : Preds) {
752       if (Pred->isLandingPad())
753         continue;
754 
755       if (Pred->getSuccessor() == &BB ||
756           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
757           Pred->getConditionalSuccessor(false) == &BB)
758         if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
759           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
760                        BB.isEntryPoint());
761     }
762   }
763 
764   return NumDoubleJumps;
765 }
766 
767 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
768     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
769     const BinaryBasicBlock *BB, const bool DirectionFlag) {
770   if (BeenOptimized.count(PredBB))
771     return false;
772 
773   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
774 
775   if (IsForward)
776     ++NumOrigForwardBranches;
777   else
778     ++NumOrigBackwardBranches;
779 
780   if (opts::SctcMode == opts::SctcAlways)
781     return true;
782 
783   if (opts::SctcMode == opts::SctcPreserveDirection)
784     return IsForward == DirectionFlag;
785 
786   const ErrorOr<std::pair<double, double>> Frequency =
787       PredBB->getBranchStats(BB);
788 
789   // It's ok to rewrite the conditional branch if the new target will be
790   // a backward branch.
791 
792   // If no data available for these branches, then it should be ok to
793   // do the optimization since it will reduce code size.
794   if (Frequency.getError())
795     return true;
796 
797   // TODO: should this use misprediction frequency instead?
798   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
799                       (!IsForward && Frequency.get().first <= 0.5);
800 
801   return Result == DirectionFlag;
802 }
803 
804 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
805   // Need updated indices to correctly detect branch' direction.
806   BF.getLayout().updateLayoutIndices();
807   BF.markUnreachableBlocks();
808 
809   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
810   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
811   uint64_t NumLocalCTCCandidates = 0;
812   uint64_t NumLocalCTCs = 0;
813   uint64_t LocalCTCTakenCount = 0;
814   uint64_t LocalCTCExecCount = 0;
815   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
816       NeedsUncondBranch;
817 
818   // Will block be deleted by UCE?
819   auto isValid = [](const BinaryBasicBlock *BB) {
820     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
821   };
822 
823   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
824     // Locate BB with a single direct tail-call instruction.
825     if (BB->getNumNonPseudos() != 1)
826       continue;
827 
828     MCInst *Instr = BB->getFirstNonPseudoInstr();
829     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
830       continue;
831 
832     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
833     if (!CalleeSymbol)
834       continue;
835 
836     // Detect direction of the possible conditional tail call.
837     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
838 
839     // Iterate through all predecessors.
840     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
841       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
842       if (!CondSucc)
843         continue;
844 
845       ++NumLocalCTCCandidates;
846 
847       const MCSymbol *TBB = nullptr;
848       const MCSymbol *FBB = nullptr;
849       MCInst *CondBranch = nullptr;
850       MCInst *UncondBranch = nullptr;
851       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
852 
853       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
854       if (!Result) {
855         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
856                    PredBB->dump());
857         continue;
858       }
859 
860       assert(Result && "internal error analyzing conditional branch");
861       assert(CondBranch && "conditional branch expected");
862 
863       // It's possible that PredBB is also a successor to BB that may have
864       // been processed by a previous iteration of the SCTC loop, in which
865       // case it may have been marked invalid.  We should skip rewriting in
866       // this case.
867       if (!PredBB->isValid()) {
868         assert(PredBB->isSuccessor(BB) &&
869                "PredBB should be valid if it is not a successor to BB");
870         continue;
871       }
872 
873       // We don't want to reverse direction of the branch in new order
874       // without further profile analysis.
875       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
876       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
877         continue;
878 
879       // Record this block so that we don't try to optimize it twice.
880       BeenOptimized.insert(PredBB);
881 
882       uint64_t Count = 0;
883       if (CondSucc != BB) {
884         // Patch the new target address into the conditional branch.
885         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
886         // Since we reversed the condition on the branch we need to change
887         // the target for the unconditional branch or add a unconditional
888         // branch to the old target.  This has to be done manually since
889         // fixupBranches is not called after SCTC.
890         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
891         Count = PredBB->getFallthroughBranchInfo().Count;
892       } else {
893         // Change destination of the conditional branch.
894         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
895         Count = PredBB->getTakenBranchInfo().Count;
896       }
897       const uint64_t CTCTakenFreq =
898           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
899 
900       // Annotate it, so "isCall" returns true for this jcc
901       MIB->setConditionalTailCall(*CondBranch);
902       // Add info about the conditional tail call frequency, otherwise this
903       // info will be lost when we delete the associated BranchInfo entry
904       auto &CTCAnnotation =
905           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
906       CTCAnnotation = CTCTakenFreq;
907 
908       // Remove the unused successor which may be eliminated later
909       // if there are no other users.
910       PredBB->removeSuccessor(BB);
911       // Update BB execution count
912       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
913         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
914       else if (CTCTakenFreq > BB->getKnownExecutionCount())
915         BB->setExecutionCount(0);
916 
917       ++NumLocalCTCs;
918       LocalCTCTakenCount += CTCTakenFreq;
919       LocalCTCExecCount += PredBB->getKnownExecutionCount();
920     }
921 
922     // Remove the block from CFG if all predecessors were removed.
923     BB->markValid(isValid(BB));
924   }
925 
926   // Add unconditional branches at the end of BBs to new successors
927   // as long as the successor is not a fallthrough.
928   for (auto &Entry : NeedsUncondBranch) {
929     BinaryBasicBlock *PredBB = Entry.first;
930     const BinaryBasicBlock *CondSucc = Entry.second;
931 
932     const MCSymbol *TBB = nullptr;
933     const MCSymbol *FBB = nullptr;
934     MCInst *CondBranch = nullptr;
935     MCInst *UncondBranch = nullptr;
936     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
937 
938     // Find the next valid block.  Invalid blocks will be deleted
939     // so they shouldn't be considered fallthrough targets.
940     const BinaryBasicBlock *NextBlock =
941         BF.getLayout().getBasicBlockAfter(PredBB, false);
942     while (NextBlock && !isValid(NextBlock))
943       NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
944 
945     // Get the unconditional successor to this block.
946     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
947     assert(PredSucc && "The other branch should be a tail call");
948 
949     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
950 
951     if (UncondBranch) {
952       if (HasFallthrough)
953         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
954       else
955         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
956     } else if (!HasFallthrough) {
957       MCInst Branch;
958       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
959       PredBB->addInstruction(Branch);
960     }
961   }
962 
963   if (NumLocalCTCs > 0) {
964     NumDoubleJumps += fixDoubleJumps(BF, true);
965     // Clean-up unreachable tail-call blocks.
966     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
967     DeletedBlocks += Stats.first;
968     DeletedBytes += Stats.second;
969 
970     assert(BF.validateCFG());
971   }
972 
973   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
974                     << " conditional tail calls from a total of "
975                     << NumLocalCTCCandidates << " candidates in function " << BF
976                     << ". CTCs execution count for this function is "
977                     << LocalCTCExecCount << " and CTC taken count is "
978                     << LocalCTCTakenCount << "\n";);
979 
980   NumTailCallsPatched += NumLocalCTCs;
981   NumCandidateTailCalls += NumLocalCTCCandidates;
982   CTCExecCount += LocalCTCExecCount;
983   CTCTakenCount += LocalCTCTakenCount;
984 
985   return NumLocalCTCs > 0;
986 }
987 
988 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
989   if (!BC.isX86())
990     return;
991 
992   for (auto &It : BC.getBinaryFunctions()) {
993     BinaryFunction &Function = It.second;
994 
995     if (!shouldOptimize(Function))
996       continue;
997 
998     if (fixTailCalls(Function)) {
999       Modified.insert(&Function);
1000       Function.setHasCanonicalCFG(false);
1001     }
1002   }
1003 
1004   if (NumTailCallsPatched)
1005     outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1006            << " tail calls (" << NumOrigForwardBranches << " forward)"
1007            << " tail calls (" << NumOrigBackwardBranches << " backward)"
1008            << " from a total of " << NumCandidateTailCalls << " while removing "
1009            << NumDoubleJumps << " double jumps"
1010            << " and removing " << DeletedBlocks << " basic blocks"
1011            << " totalling " << DeletedBytes
1012            << " bytes of code. CTCs total execution count is " << CTCExecCount
1013            << " and the number of times CTCs are taken is " << CTCTakenCount
1014            << "\n";
1015 }
1016 
1017 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1018   uint64_t Count = 0;
1019   const BinaryContext &BC = Function.getBinaryContext();
1020   for (BinaryBasicBlock &BB : Function) {
1021     for (MCInst &Inst : BB) {
1022       MCInst OriginalInst;
1023       if (opts::Verbosity > 2)
1024         OriginalInst = Inst;
1025 
1026       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1027         continue;
1028 
1029       if (opts::Verbosity > 2) {
1030         BC.scopeLock();
1031         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1032         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1033         outs() << "BOLT-INFO: to:";
1034         BC.printInstruction(outs(), Inst, 0, &Function);
1035       }
1036 
1037       ++Count;
1038     }
1039   }
1040 
1041   return Count;
1042 }
1043 
1044 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1045   std::atomic<uint64_t> NumShortened{0};
1046   if (!BC.isX86())
1047     return;
1048 
1049   ParallelUtilities::runOnEachFunction(
1050       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1051       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1052       nullptr, "ShortenInstructions");
1053 
1054   if (NumShortened)
1055     outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1056 }
1057 
1058 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1059   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1060   for (BinaryBasicBlock &BB : Function) {
1061     MCInst *Inst = BB.getLastNonPseudoInstr();
1062     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1063       MCInst Trap;
1064       if (MIB->createTrap(Trap)) {
1065         BB.addInstruction(Trap);
1066         ++TailCallTraps;
1067       }
1068     }
1069   }
1070 }
1071 
1072 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1073   for (BinaryBasicBlock &BB : Function) {
1074     if (BB.succ_size() != 2)
1075       continue;
1076 
1077     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1078     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1079     if (CondBB != UncondBB)
1080       continue;
1081 
1082     const MCSymbol *TBB = nullptr;
1083     const MCSymbol *FBB = nullptr;
1084     MCInst *CondBranch = nullptr;
1085     MCInst *UncondBranch = nullptr;
1086     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1087 
1088     // analyzeBranch() can fail due to unusual branch instructions,
1089     // e.g. jrcxz, or jump tables (indirect jump).
1090     if (!Result || !CondBranch)
1091       continue;
1092 
1093     BB.removeDuplicateConditionalSuccessor(CondBranch);
1094     ++NumUselessCondBranches;
1095   }
1096 }
1097 
1098 void Peepholes::runOnFunctions(BinaryContext &BC) {
1099   const char Opts =
1100       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1101                       [](const char A, const PeepholeOpts B) { return A | B; });
1102   if (Opts == PEEP_NONE)
1103     return;
1104 
1105   for (auto &It : BC.getBinaryFunctions()) {
1106     BinaryFunction &Function = It.second;
1107     if (shouldOptimize(Function)) {
1108       if (Opts & PEEP_DOUBLE_JUMPS)
1109         NumDoubleJumps += fixDoubleJumps(Function, false);
1110       if (Opts & PEEP_TAILCALL_TRAPS)
1111         addTailcallTraps(Function);
1112       if (Opts & PEEP_USELESS_BRANCHES)
1113         removeUselessCondBranches(Function);
1114       assert(Function.validateCFG());
1115     }
1116   }
1117   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1118          << " double jumps patched.\n"
1119          << "BOLT-INFO: Peephole: " << TailCallTraps
1120          << " tail call traps inserted.\n"
1121          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1122          << " useless conditional branches removed.\n";
1123 }
1124 
1125 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1126   BinaryContext &BC = BF.getBinaryContext();
1127   MCPlusBuilder *MIB = BC.MIB.get();
1128 
1129   uint64_t NumLocalLoadsSimplified = 0;
1130   uint64_t NumDynamicLocalLoadsSimplified = 0;
1131   uint64_t NumLocalLoadsFound = 0;
1132   uint64_t NumDynamicLocalLoadsFound = 0;
1133 
1134   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
1135     for (MCInst &Inst : *BB) {
1136       unsigned Opcode = Inst.getOpcode();
1137       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1138 
1139       // Skip instructions that do not load from memory.
1140       if (!Desc.mayLoad())
1141         continue;
1142 
1143       // Try to statically evaluate the target memory address;
1144       uint64_t TargetAddress;
1145 
1146       if (MIB->hasPCRelOperand(Inst)) {
1147         // Try to find the symbol that corresponds to the PC-relative operand.
1148         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1149         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1150         assert(DispOpI->isExpr() &&
1151                "found PC-relative with non-symbolic displacement");
1152 
1153         // Get displacement symbol.
1154         const MCSymbol *DisplSymbol;
1155         uint64_t DisplOffset;
1156 
1157         std::tie(DisplSymbol, DisplOffset) =
1158             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1159 
1160         if (!DisplSymbol)
1161           continue;
1162 
1163         // Look up the symbol address in the global symbols map of the binary
1164         // context object.
1165         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1166         if (!BD)
1167           continue;
1168         TargetAddress = BD->getAddress() + DisplOffset;
1169       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1170         continue;
1171       }
1172 
1173       // Get the contents of the section containing the target address of the
1174       // memory operand. We are only interested in read-only sections.
1175       ErrorOr<BinarySection &> DataSection =
1176           BC.getSectionForAddress(TargetAddress);
1177       if (!DataSection || DataSection->isWritable())
1178         continue;
1179 
1180       if (BC.getRelocationAt(TargetAddress) ||
1181           BC.getDynamicRelocationAt(TargetAddress))
1182         continue;
1183 
1184       uint32_t Offset = TargetAddress - DataSection->getAddress();
1185       StringRef ConstantData = DataSection->getContents();
1186 
1187       ++NumLocalLoadsFound;
1188       if (BB->hasProfile())
1189         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1190 
1191       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1192         ++NumLocalLoadsSimplified;
1193         if (BB->hasProfile())
1194           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1195       }
1196     }
1197   }
1198 
1199   NumLoadsFound += NumLocalLoadsFound;
1200   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1201   NumLoadsSimplified += NumLocalLoadsSimplified;
1202   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1203 
1204   return NumLocalLoadsSimplified > 0;
1205 }
1206 
1207 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1208   for (auto &It : BC.getBinaryFunctions()) {
1209     BinaryFunction &Function = It.second;
1210     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1211       Modified.insert(&Function);
1212   }
1213 
1214   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1215          << NumLoadsFound << " loads from a statically computed address.\n"
1216          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1217          << "\n"
1218          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1219 }
1220 
1221 void AssignSections::runOnFunctions(BinaryContext &BC) {
1222   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1223     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1224     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1225   }
1226 
1227   // In non-relocation mode functions have pre-assigned section names.
1228   if (!BC.HasRelocations)
1229     return;
1230 
1231   const bool UseColdSection =
1232       BC.NumProfiledFuncs > 0 ||
1233       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1234   for (auto &BFI : BC.getBinaryFunctions()) {
1235     BinaryFunction &Function = BFI.second;
1236     if (opts::isHotTextMover(Function)) {
1237       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1238       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1239       continue;
1240     }
1241 
1242     if (!UseColdSection || Function.hasValidIndex())
1243       Function.setCodeSectionName(BC.getMainCodeSectionName());
1244     else
1245       Function.setCodeSectionName(BC.getColdCodeSectionName());
1246 
1247     if (Function.isSplit())
1248       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1249   }
1250 }
1251 
1252 void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1253   double FlowImbalanceMean = 0.0;
1254   size_t NumBlocksConsidered = 0;
1255   double WorstBias = 0.0;
1256   const BinaryFunction *WorstBiasFunc = nullptr;
1257 
1258   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1259   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1260   // of the frequency of outgoing edges.
1261   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1262   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1263   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1264 
1265   // Compute mean
1266   for (const auto &BFI : BC.getBinaryFunctions()) {
1267     const BinaryFunction &Function = BFI.second;
1268     if (Function.empty() || !Function.isSimple())
1269       continue;
1270     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1271     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1272     for (const BinaryBasicBlock &BB : Function) {
1273       uint64_t TotalOutgoing = 0ULL;
1274       auto SuccBIIter = BB.branch_info_begin();
1275       for (BinaryBasicBlock *Succ : BB.successors()) {
1276         uint64_t Count = SuccBIIter->Count;
1277         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1278           ++SuccBIIter;
1279           continue;
1280         }
1281         TotalOutgoing += Count;
1282         IncomingMap[Succ] += Count;
1283         ++SuccBIIter;
1284       }
1285       OutgoingMap[&BB] = TotalOutgoing;
1286     }
1287 
1288     size_t NumBlocks = 0;
1289     double Mean = 0.0;
1290     for (const BinaryBasicBlock &BB : Function) {
1291       // Do not compute score for low frequency blocks, entry or exit blocks
1292       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1293         continue;
1294       ++NumBlocks;
1295       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1296       Mean += fabs(Difference / IncomingMap[&BB]);
1297     }
1298 
1299     FlowImbalanceMean += Mean;
1300     NumBlocksConsidered += NumBlocks;
1301     if (!NumBlocks)
1302       continue;
1303     double FuncMean = Mean / NumBlocks;
1304     if (FuncMean > WorstBias) {
1305       WorstBias = FuncMean;
1306       WorstBiasFunc = &Function;
1307     }
1308   }
1309   if (NumBlocksConsidered > 0)
1310     FlowImbalanceMean /= NumBlocksConsidered;
1311 
1312   // Compute standard deviation
1313   NumBlocksConsidered = 0;
1314   double FlowImbalanceVar = 0.0;
1315   for (const auto &BFI : BC.getBinaryFunctions()) {
1316     const BinaryFunction &Function = BFI.second;
1317     if (Function.empty() || !Function.isSimple())
1318       continue;
1319     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1320     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1321     for (const BinaryBasicBlock &BB : Function) {
1322       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1323         continue;
1324       ++NumBlocksConsidered;
1325       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1326       FlowImbalanceVar +=
1327           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1328     }
1329   }
1330   if (NumBlocksConsidered) {
1331     FlowImbalanceVar /= NumBlocksConsidered;
1332     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1333   }
1334 
1335   // Report to user
1336   outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1337                    (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1338   if (WorstBiasFunc && opts::Verbosity >= 1) {
1339     outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
1340            << "\n";
1341     LLVM_DEBUG(WorstBiasFunc->dump());
1342   }
1343 }
1344 
1345 void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1346   uint64_t NumRegularFunctions = 0;
1347   uint64_t NumStaleProfileFunctions = 0;
1348   uint64_t NumAllStaleFunctions = 0;
1349   uint64_t NumInferredFunctions = 0;
1350   uint64_t NumNonSimpleProfiledFunctions = 0;
1351   uint64_t NumUnknownControlFlowFunctions = 0;
1352   uint64_t TotalSampleCount = 0;
1353   uint64_t StaleSampleCount = 0;
1354   uint64_t InferredSampleCount = 0;
1355   std::vector<const BinaryFunction *> ProfiledFunctions;
1356   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1357   for (auto &BFI : BC.getBinaryFunctions()) {
1358     const BinaryFunction &Function = BFI.second;
1359 
1360     // Ignore PLT functions for stats.
1361     if (Function.isPLTFunction())
1362       continue;
1363 
1364     ++NumRegularFunctions;
1365 
1366     if (!Function.isSimple()) {
1367       if (Function.hasProfile())
1368         ++NumNonSimpleProfiledFunctions;
1369       continue;
1370     }
1371 
1372     if (Function.hasUnknownControlFlow()) {
1373       if (opts::PrintUnknownCFG)
1374         Function.dump();
1375       else if (opts::PrintUnknown)
1376         errs() << "function with unknown control flow: " << Function << '\n';
1377 
1378       ++NumUnknownControlFlowFunctions;
1379     }
1380 
1381     if (!Function.hasProfile())
1382       continue;
1383 
1384     uint64_t SampleCount = Function.getRawBranchCount();
1385     TotalSampleCount += SampleCount;
1386 
1387     if (Function.hasValidProfile()) {
1388       ProfiledFunctions.push_back(&Function);
1389       if (Function.hasInferredProfile()) {
1390         ++NumInferredFunctions;
1391         InferredSampleCount += SampleCount;
1392         ++NumAllStaleFunctions;
1393       }
1394     } else {
1395       if (opts::ReportStaleFuncs) {
1396         outs() << StaleFuncsHeader;
1397         StaleFuncsHeader = "";
1398         outs() << "  " << Function << '\n';
1399       }
1400       ++NumStaleProfileFunctions;
1401       StaleSampleCount += SampleCount;
1402       ++NumAllStaleFunctions;
1403     }
1404   }
1405   BC.NumProfiledFuncs = ProfiledFunctions.size();
1406   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1407 
1408   const size_t NumAllProfiledFunctions =
1409       ProfiledFunctions.size() + NumStaleProfileFunctions;
1410   outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1411          << NumRegularFunctions << " functions in the binary ("
1412          << format("%.1f", NumAllProfiledFunctions /
1413                                (float)NumRegularFunctions * 100.0f)
1414          << "%) have non-empty execution profile\n";
1415   if (NumNonSimpleProfiledFunctions) {
1416     outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1417            << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1418            << " with profile could not be optimized\n";
1419   }
1420   if (NumAllStaleFunctions) {
1421     const float PctStale =
1422         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
1423     const float PctStaleFuncsWithEqualBlockCount =
1424         (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
1425         NumAllStaleFunctions * 100.0f;
1426     const float PctStaleBlocksWithEqualIcount =
1427         (float)BC.Stats.NumStaleBlocksWithEqualIcount /
1428         BC.Stats.NumStaleBlocks * 100.0f;
1429     auto printErrorOrWarning = [&]() {
1430       if (PctStale > opts::StaleThreshold)
1431         errs() << "BOLT-ERROR: ";
1432       else
1433         errs() << "BOLT-WARNING: ";
1434     };
1435     printErrorOrWarning();
1436     errs() << NumAllStaleFunctions
1437            << format(" (%.1f%% of all profiled)", PctStale) << " function"
1438            << (NumAllStaleFunctions == 1 ? "" : "s")
1439            << " have invalid (possibly stale) profile."
1440               " Use -report-stale to see the list.\n";
1441     if (TotalSampleCount > 0) {
1442       printErrorOrWarning();
1443       errs() << (StaleSampleCount + InferredSampleCount) << " out of "
1444              << TotalSampleCount << " samples in the binary ("
1445              << format("%.1f",
1446                        ((100.0f * (StaleSampleCount + InferredSampleCount)) /
1447                         TotalSampleCount))
1448              << "%) belong to functions with invalid"
1449                 " (possibly stale) profile.\n";
1450     }
1451     outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
1452            << " stale function"
1453            << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
1454            << format(" (%.1f%% of all stale)", PctStaleFuncsWithEqualBlockCount)
1455            << " have matching block count.\n";
1456     outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
1457            << " stale block"
1458            << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
1459            << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
1460            << " have matching icount.\n";
1461     if (PctStale > opts::StaleThreshold) {
1462       errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
1463              << opts::StaleThreshold << "%. Exiting.\n";
1464       exit(1);
1465     }
1466   }
1467   if (NumInferredFunctions) {
1468     outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
1469                      "%.2f%% of stale) functions responsible for %.2f%% samples"
1470                      " (%zu out of %zu)\n",
1471                      NumInferredFunctions,
1472                      100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1473                      100.0 * NumInferredFunctions / NumAllStaleFunctions,
1474                      100.0 * InferredSampleCount / TotalSampleCount,
1475                      InferredSampleCount, TotalSampleCount);
1476     outs() << format(
1477         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1478         " (%zu out of %zu stale) responsible for %.2f%% samples"
1479         " (%zu out of %zu stale)\n",
1480         100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1481         BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1482         100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1483         BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1484   }
1485 
1486   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1487     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1488            << " objects was ignored\n";
1489   }
1490 
1491   if (ProfiledFunctions.size() > 10) {
1492     if (opts::Verbosity >= 1) {
1493       outs() << "BOLT-INFO: top called functions are:\n";
1494       llvm::sort(ProfiledFunctions,
1495                  [](const BinaryFunction *A, const BinaryFunction *B) {
1496                    return B->getExecutionCount() < A->getExecutionCount();
1497                  });
1498       auto SFI = ProfiledFunctions.begin();
1499       auto SFIend = ProfiledFunctions.end();
1500       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1501            ++SFI, ++I)
1502         outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
1503     }
1504   }
1505 
1506   if (!opts::PrintSortedBy.empty()) {
1507     std::vector<BinaryFunction *> Functions;
1508     std::map<const BinaryFunction *, DynoStats> Stats;
1509 
1510     for (auto &BFI : BC.getBinaryFunctions()) {
1511       BinaryFunction &BF = BFI.second;
1512       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1513         Functions.push_back(&BF);
1514         Stats.emplace(&BF, getDynoStats(BF));
1515       }
1516     }
1517 
1518     const bool SortAll =
1519         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1520 
1521     const bool Ascending =
1522         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1523 
1524     if (SortAll) {
1525       llvm::stable_sort(Functions,
1526                         [Ascending, &Stats](const BinaryFunction *A,
1527                                             const BinaryFunction *B) {
1528                           return Ascending ? Stats.at(A) < Stats.at(B)
1529                                            : Stats.at(B) < Stats.at(A);
1530                         });
1531     } else {
1532       llvm::stable_sort(
1533           Functions, [Ascending, &Stats](const BinaryFunction *A,
1534                                          const BinaryFunction *B) {
1535             const DynoStats &StatsA = Stats.at(A);
1536             const DynoStats &StatsB = Stats.at(B);
1537             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1538                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1539           });
1540     }
1541 
1542     outs() << "BOLT-INFO: top functions sorted by ";
1543     if (SortAll) {
1544       outs() << "dyno stats";
1545     } else {
1546       outs() << "(";
1547       bool PrintComma = false;
1548       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1549         if (PrintComma)
1550           outs() << ", ";
1551         outs() << DynoStats::Description(Category);
1552         PrintComma = true;
1553       }
1554       outs() << ")";
1555     }
1556 
1557     outs() << " are:\n";
1558     auto SFI = Functions.begin();
1559     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1560       const DynoStats Stats = getDynoStats(**SFI);
1561       outs() << "  " << **SFI;
1562       if (!SortAll) {
1563         outs() << " (";
1564         bool PrintComma = false;
1565         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1566           if (PrintComma)
1567             outs() << ", ";
1568           outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1569           PrintComma = true;
1570         }
1571         outs() << ")";
1572       }
1573       outs() << "\n";
1574     }
1575   }
1576 
1577   if (!BC.TrappedFunctions.empty()) {
1578     errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1579            << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1580            << " will trap on entry. Use -trap-avx512=0 to disable"
1581               " traps.";
1582     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1583       errs() << '\n';
1584       for (const BinaryFunction *Function : BC.TrappedFunctions)
1585         errs() << "  " << *Function << '\n';
1586     } else {
1587       errs() << " Use -v=1 to see the list.\n";
1588     }
1589   }
1590 
1591   // Print information on missed macro-fusion opportunities seen on input.
1592   if (BC.Stats.MissedMacroFusionPairs) {
1593     outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)"
1594                      " opportunities for macro-fusion optimization",
1595                      BC.Stats.MissedMacroFusionPairs,
1596                      BC.Stats.MissedMacroFusionExecCount);
1597     switch (opts::AlignMacroOpFusion) {
1598     case MFT_NONE:
1599       outs() << ". Use -align-macro-fusion to fix.\n";
1600       break;
1601     case MFT_HOT:
1602       outs() << ". Will fix instances on a hot path.\n";
1603       break;
1604     case MFT_ALL:
1605       outs() << " that are going to be fixed\n";
1606       break;
1607     }
1608   }
1609 
1610   // Collect and print information about suboptimal code layout on input.
1611   if (opts::ReportBadLayout) {
1612     std::vector<BinaryFunction *> SuboptimalFuncs;
1613     for (auto &BFI : BC.getBinaryFunctions()) {
1614       BinaryFunction &BF = BFI.second;
1615       if (!BF.hasValidProfile())
1616         continue;
1617 
1618       const uint64_t HotThreshold =
1619           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1620       bool HotSeen = false;
1621       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1622         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1623           HotSeen = true;
1624           continue;
1625         }
1626         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1627           SuboptimalFuncs.push_back(&BF);
1628           break;
1629         }
1630       }
1631     }
1632 
1633     if (!SuboptimalFuncs.empty()) {
1634       llvm::sort(SuboptimalFuncs,
1635                  [](const BinaryFunction *A, const BinaryFunction *B) {
1636                    return A->getKnownExecutionCount() / A->getSize() >
1637                           B->getKnownExecutionCount() / B->getSize();
1638                  });
1639 
1640       outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1641              << " functions have "
1642                 "cold code in the middle of hot code. Top functions are:\n";
1643       for (unsigned I = 0;
1644            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1645                         SuboptimalFuncs.size());
1646            ++I)
1647         SuboptimalFuncs[I]->print(outs());
1648     }
1649   }
1650 
1651   if (NumUnknownControlFlowFunctions) {
1652     outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1653            << " functions have instructions with unknown control flow";
1654     if (!opts::PrintUnknown)
1655       outs() << ". Use -print-unknown to see the list.";
1656     outs() << '\n';
1657   }
1658 }
1659 
1660 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1661   for (auto &BFI : BC.getBinaryFunctions())
1662     for (BinaryBasicBlock &BB : BFI.second)
1663       for (MCInst &Instruction : BB)
1664         BC.MIB->lowerTailCall(Instruction);
1665 }
1666 
1667 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1668   if (!BC.isX86())
1669     return;
1670 
1671   uint64_t NumPrefixesRemoved = 0;
1672   uint64_t NumBytesSaved = 0;
1673   for (auto &BFI : BC.getBinaryFunctions()) {
1674     for (BinaryBasicBlock &BB : BFI.second) {
1675       auto LastInstRIter = BB.getLastNonPseudo();
1676       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1677           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1678         continue;
1679 
1680       NumPrefixesRemoved += BB.getKnownExecutionCount();
1681       ++NumBytesSaved;
1682     }
1683   }
1684 
1685   if (NumBytesSaved)
1686     outs() << "BOLT-INFO: removed " << NumBytesSaved
1687            << " 'repz' prefixes"
1688               " with estimated execution count of "
1689            << NumPrefixesRemoved << " times.\n";
1690 }
1691 
1692 void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1693   if (!BC.isX86())
1694     return;
1695 
1696   uint64_t NumInlined = 0;
1697   uint64_t NumInlinedDyno = 0;
1698   for (auto &BFI : BC.getBinaryFunctions()) {
1699     for (BinaryBasicBlock &BB : BFI.second) {
1700       for (auto II = BB.begin(); II != BB.end(); ++II) {
1701         MCInst &Inst = *II;
1702 
1703         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1704             !Inst.getOperand(0).isExpr())
1705           continue;
1706 
1707         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1708         if (CalleeSymbol->getName() != "memcpy" &&
1709             CalleeSymbol->getName() != "memcpy@PLT" &&
1710             CalleeSymbol->getName() != "_memcpy8")
1711           continue;
1712 
1713         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1714         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1715 
1716         const InstructionListType NewCode =
1717             BC.MIB->createInlineMemcpy(IsMemcpy8);
1718         II = BB.replaceInstruction(II, NewCode);
1719         std::advance(II, NewCode.size() - 1);
1720         if (IsTailCall) {
1721           MCInst Return;
1722           BC.MIB->createReturn(Return);
1723           II = BB.insertInstruction(std::next(II), std::move(Return));
1724         }
1725 
1726         ++NumInlined;
1727         NumInlinedDyno += BB.getKnownExecutionCount();
1728       }
1729     }
1730   }
1731 
1732   if (NumInlined) {
1733     outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1734     if (NumInlinedDyno)
1735       outs() << ". The calls were executed " << NumInlinedDyno
1736              << " times based on profile.";
1737     outs() << '\n';
1738   }
1739 }
1740 
1741 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1742   if (!BinaryFunctionPass::shouldOptimize(Function))
1743     return false;
1744 
1745   for (const std::string &FunctionSpec : Spec) {
1746     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1747     if (Function.hasNameRegex(FunctionName))
1748       return true;
1749   }
1750 
1751   return false;
1752 }
1753 
1754 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1755     const BinaryFunction &Function) const {
1756   StringRef SitesString;
1757   for (const std::string &FunctionSpec : Spec) {
1758     StringRef FunctionName;
1759     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1760     if (Function.hasNameRegex(FunctionName))
1761       break;
1762     SitesString = "";
1763   }
1764 
1765   std::set<size_t> Sites;
1766   SmallVector<StringRef, 4> SitesVec;
1767   SitesString.split(SitesVec, ':');
1768   for (StringRef SiteString : SitesVec) {
1769     if (SiteString.empty())
1770       continue;
1771     size_t Result;
1772     if (!SiteString.getAsInteger(10, Result))
1773       Sites.emplace(Result);
1774   }
1775 
1776   return Sites;
1777 }
1778 
1779 void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1780   if (!BC.isX86())
1781     return;
1782 
1783   uint64_t NumSpecialized = 0;
1784   uint64_t NumSpecializedDyno = 0;
1785   for (auto &BFI : BC.getBinaryFunctions()) {
1786     BinaryFunction &Function = BFI.second;
1787     if (!shouldOptimize(Function))
1788       continue;
1789 
1790     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1791     auto shouldOptimize = [&](size_t N) {
1792       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1793     };
1794 
1795     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1796     size_t CallSiteID = 0;
1797     for (BinaryBasicBlock *CurBB : Blocks) {
1798       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1799         MCInst &Inst = *II;
1800 
1801         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1802             !Inst.getOperand(0).isExpr())
1803           continue;
1804 
1805         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1806         if (CalleeSymbol->getName() != "memcpy" &&
1807             CalleeSymbol->getName() != "memcpy@PLT")
1808           continue;
1809 
1810         if (BC.MIB->isTailCall(Inst))
1811           continue;
1812 
1813         ++CallSiteID;
1814 
1815         if (!shouldOptimize(CallSiteID))
1816           continue;
1817 
1818         // Create a copy of a call to memcpy(dest, src, size).
1819         MCInst MemcpyInstr = Inst;
1820 
1821         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1822 
1823         BinaryBasicBlock *NextBB = nullptr;
1824         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1825           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1826           NextBB->eraseInstruction(NextBB->begin());
1827         } else {
1828           NextBB = OneByteMemcpyBB->getSuccessor();
1829           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1830           assert(NextBB && "unexpected call to memcpy() with no return");
1831         }
1832 
1833         BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
1834         MemcpyBB->setOffset(CurBB->getInputOffset());
1835         InstructionListType CmpJCC =
1836             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1837                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1838         CurBB->addInstructions(CmpJCC);
1839         CurBB->addSuccessor(MemcpyBB);
1840 
1841         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1842         MemcpyBB->addSuccessor(NextBB);
1843         MemcpyBB->setCFIState(NextBB->getCFIState());
1844         MemcpyBB->setExecutionCount(0);
1845 
1846         // To prevent the actual call from being moved to cold, we set its
1847         // execution count to 1.
1848         if (CurBB->getKnownExecutionCount() > 0)
1849           MemcpyBB->setExecutionCount(1);
1850 
1851         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1852         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1853 
1854         ++NumSpecialized;
1855         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1856 
1857         CurBB = NextBB;
1858 
1859         // Note: we don't expect the next instruction to be a call to memcpy.
1860         II = CurBB->begin();
1861       }
1862     }
1863   }
1864 
1865   if (NumSpecialized) {
1866     outs() << "BOLT-INFO: specialized " << NumSpecialized
1867            << " memcpy() call sites for size 1";
1868     if (NumSpecializedDyno)
1869       outs() << ". The calls were executed " << NumSpecializedDyno
1870              << " times based on profile.";
1871     outs() << '\n';
1872   }
1873 }
1874 
1875 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1876   const BinaryContext &BC = BF.getBinaryContext();
1877   for (BinaryBasicBlock &BB : BF) {
1878     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1879       MCInst &Inst = BB.getInstructionAtIndex(I);
1880       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1881         BB.eraseInstructionAtIndex(I);
1882     }
1883   }
1884 }
1885 
1886 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1887   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1888     runOnFunction(BF);
1889   };
1890 
1891   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1892     return BF.shouldPreserveNops();
1893   };
1894 
1895   ParallelUtilities::runOnEachFunction(
1896       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1897       SkipFunc, "RemoveNops");
1898 }
1899 
1900 } // namespace bolt
1901 } // namespace llvm
1902