xref: /llvm-project/bolt/lib/Passes/BinaryPasses.cpp (revision a5f3d1a803020167bd9d494a8a3921e7dcc1550a)
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "llvm/Support/CommandLine.h"
19 #include <atomic>
20 #include <mutex>
21 #include <numeric>
22 #include <vector>
23 
24 #define DEBUG_TYPE "bolt-opts"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
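// Build the command-line option name for a dyno stats category by taking its
// description and replacing spaces with dashes. The names are kept in a static
// buffer so the returned pointer remains valid.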
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32 
33   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34 
35   OptNames[C] = bolt::DynoStats::Description(C);
36 
37   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
38 
39   return OptNames[C].c_str();
40 }
41 
42 namespace opts {
43 
44 extern cl::OptionCategory BoltCategory;
45 extern cl::OptionCategory BoltOptCategory;
46 
47 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
48 extern cl::opt<unsigned> Verbosity;
49 extern cl::opt<bool> EnableBAT;
50 extern cl::opt<unsigned> ExecutionCountThreshold;
51 extern cl::opt<bool> UpdateDebugSections;
52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
53 
54 enum DynoStatsSortOrder : char {
55   Ascending,
56   Descending
57 };
58 
59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
60     "print-sorted-by-order",
61     cl::desc("use ascending or descending order when printing functions "
62              "ordered by dyno stats"),
63     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
64 
65 cl::list<std::string>
66 HotTextMoveSections("hot-text-move-sections",
67   cl::desc("list of sections containing functions used for hugifying hot text. "
68            "BOLT makes sure these functions are not placed on the same page as "
69            "the hot text. (default=\'.stub,.mover\')."),
70   cl::value_desc("sec1,sec2,sec3,..."),
71   cl::CommaSeparated,
72   cl::ZeroOrMore,
73   cl::cat(BoltCategory));
74 
75 bool isHotTextMover(const BinaryFunction &Function) {
76   for (std::string &SectionName : opts::HotTextMoveSections) {
77     if (Function.getOriginSectionName() &&
78         *Function.getOriginSectionName() == SectionName)
79       return true;
80   }
81 
82   return false;
83 }
84 
85 static cl::opt<bool> MinBranchClusters(
86     "min-branch-clusters",
87     cl::desc("use a modified clustering algorithm geared towards minimizing "
88              "branches"),
89     cl::Hidden, cl::cat(BoltOptCategory));
90 
91 static cl::list<Peepholes::PeepholeOpts> Peepholes(
92     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
93     cl::value_desc("opt1,opt2,opt3,..."),
94     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
95                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
96                           "remove double jumps when able"),
97                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
98                           "insert tail call traps"),
99                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
100                           "remove useless conditional branches"),
101                clEnumValN(Peepholes::PEEP_ALL, "all",
102                           "enable all peephole optimizations")),
103     cl::ZeroOrMore, cl::cat(BoltOptCategory));
104 
105 static cl::opt<unsigned>
106     PrintFuncStat("print-function-statistics",
107                   cl::desc("print statistics about basic block ordering"),
108                   cl::init(0), cl::cat(BoltOptCategory));
109 
110 static cl::list<bolt::DynoStats::Category>
111     PrintSortedBy("print-sorted-by", cl::CommaSeparated,
112                   cl::desc("print functions sorted by order of dyno stats"),
113                   cl::value_desc("key1,key2,key3,..."),
114                   cl::values(
115 #define D(name, description, ...)                                              \
116   clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
117              description),
118                       REAL_DYNO_STATS
119 #undef D
120                           clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
121                                      "sorted by all names")),
122                   cl::ZeroOrMore, cl::cat(BoltOptCategory));
123 
124 static cl::opt<bool>
125     PrintUnknown("print-unknown",
126                  cl::desc("print names of functions with unknown control flow"),
127                  cl::cat(BoltCategory), cl::Hidden);
128 
129 static cl::opt<bool>
130     PrintUnknownCFG("print-unknown-cfg",
131                     cl::desc("dump CFG of functions with unknown control flow"),
132                     cl::cat(BoltCategory), cl::ReallyHidden);
133 
134 // Appease MSVC19 with a forward declaration: otherwise it reports an error
135 // about an undeclared variable inside a callback.
136 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
137 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
138     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
139     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
140     cl::values(
141         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
142                    "do not reorder basic blocks"),
143         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
144                    "layout blocks in reverse order"),
145         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
146                    "perform optimal layout based on profile"),
147         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
148                    "branch-predictor",
149                    "perform optimal layout prioritizing branch "
150                    "predictions"),
151         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
152                    "perform optimal layout prioritizing I-cache "
153                    "behavior"),
154         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
155                    "perform layout optimizing I-cache behavior"),
156         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
157                    "perform layout optimizing I-cache behavior"),
158         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
159                    "cluster-shuffle", "perform random layout of clusters")),
160     cl::ZeroOrMore, cl::cat(BoltOptCategory),
161     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
162       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
163         errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
164                << " use '-reorder-blocks=ext-tsp' instead\n";
165         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
166       }
167     }));
168 
169 static cl::opt<unsigned> ReportBadLayout(
170     "report-bad-layout",
171     cl::desc("print top <uint> functions with suboptimal code layout on input"),
172     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
173 
174 static cl::opt<bool>
175     ReportStaleFuncs("report-stale",
176                      cl::desc("print the list of functions with stale profile"),
177                      cl::Hidden, cl::cat(BoltOptCategory));
178 
179 enum SctcModes : char {
180   SctcAlways,
181   SctcPreserveDirection,
182   SctcHeuristic
183 };
184 
185 static cl::opt<SctcModes>
186 SctcMode("sctc-mode",
187   cl::desc("mode for simplify conditional tail calls"),
188   cl::init(SctcAlways),
189   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
190     clEnumValN(SctcPreserveDirection,
191       "preserve",
192       "only perform sctc when branch direction is "
193       "preserved"),
194     clEnumValN(SctcHeuristic,
195       "heuristic",
196       "use branch prediction data to control sctc")),
197   cl::ZeroOrMore,
198   cl::cat(BoltOptCategory));
199 
200 static cl::opt<unsigned>
201 StaleThreshold("stale-threshold",
202     cl::desc(
203       "maximum percentage of stale functions to tolerate (default: 100)"),
204     cl::init(100),
205     cl::Hidden,
206     cl::cat(BoltOptCategory));
207 
208 static cl::opt<unsigned> TSPThreshold(
209     "tsp-threshold",
210     cl::desc(
211         "maximum number of hot basic blocks in a function for which to use "
212         "a precise TSP solution while re-ordering basic blocks"),
213     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
214 
215 static cl::opt<unsigned> TopCalledLimit(
216     "top-called-limit",
217     cl::desc("maximum number of functions to print in top called "
218              "functions section"),
219     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
220 
221 } // namespace opts
222 
223 namespace llvm {
224 namespace bolt {
225 
226 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
227   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
228          !BF.isIgnored();
229 }
230 
231 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
232   return BF.isSimple() && !BF.isIgnored();
233 }
234 
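// Remove empty basic blocks that are neither entry points nor landing pads:
// dangling empty blocks with no predecessors are deleted outright, while other
// empty blocks are bypassed by pointing their predecessors at their single
// successor and removed once no predecessors remain. Duplicate CFG edges that
// result are merged via fixBranches().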
235 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
236   uint64_t NumRemoved = 0;
237   uint64_t NumDuplicateEdges = 0;
238   uint64_t NeedsFixBranches = 0;
239   for (BinaryBasicBlock &BB : BF) {
240     if (!BB.empty())
241       continue;
242 
243     if (BB.isEntryPoint() || BB.isLandingPad())
244       continue;
245 
246     // Handle a dangling empty block.
247     if (BB.succ_size() == 0) {
248       // If an empty dangling basic block has a predecessor, it could be a
249       // result of codegen for __builtin_unreachable. In such a case, do not
250       // remove the block.
251       if (BB.pred_size() == 0) {
252         BB.markValid(false);
253         ++NumRemoved;
254       }
255       continue;
256     }
257 
258     // The block should have just one successor.
259     BinaryBasicBlock *Successor = BB.getSuccessor();
260     assert(Successor && "invalid CFG encountered");
261 
262     // Redirect all predecessors to the successor block.
263     while (!BB.pred_empty()) {
264       BinaryBasicBlock *Predecessor = *BB.pred_begin();
265       if (Predecessor->hasJumpTable())
266         break;
267 
268       if (Predecessor == Successor)
269         break;
270 
271       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
272       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
273                                     BI.MispredictedCount);
274       // We need to fix branches even if we failed to replace all successors
275       // and remove the block.
276       NeedsFixBranches = true;
277     }
278 
279     if (BB.pred_empty()) {
280       BB.removeAllSuccessors();
281       BB.markValid(false);
282       ++NumRemoved;
283     }
284   }
285 
286   if (NumRemoved)
287     BF.eraseInvalidBBs();
288 
289   // Check for duplicate successors. Do it after the empty block elimination as
290   // we can get more duplicate successors.
291   for (BinaryBasicBlock &BB : BF)
292     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
293         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
294       ++NumDuplicateEdges;
295 
296   // fixBranches() will get rid of duplicate edges and update jump instructions.
297   if (NumDuplicateEdges || NeedsFixBranches)
298     BF.fixBranches();
299 
300   NumDuplicateEdgesMerged += NumDuplicateEdges;
301   NumBlocksRemoved += NumRemoved;
302 }
303 
304 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
305   ParallelUtilities::runOnEachFunction(
306       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
307       [&](BinaryFunction &BF) { runOnFunction(BF); },
308       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
309       "NormalizeCFG");
310   if (NumBlocksRemoved)
311     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
312            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
313   if (NumDuplicateEdgesMerged)
314     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
315            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
316            << '\n';
317   return Error::success();
318 }
319 
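// Mark basic blocks that are unreachable from the function's entry points and
// erase them, keeping track of how many blocks and bytes were removed.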
320 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
321   BinaryContext &BC = Function.getBinaryContext();
322   unsigned Count;
323   uint64_t Bytes;
324   Function.markUnreachableBlocks();
325   LLVM_DEBUG({
326     for (BinaryBasicBlock &BB : Function) {
327       if (!BB.isValid()) {
328         dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
329                << " in function " << Function << "\n";
330         Function.dump();
331       }
332     }
333   });
334   BinaryContext::IndependentCodeEmitter Emitter =
335       BC.createIndependentMCCodeEmitter();
336   std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
337   DeletedBlocks += Count;
338   DeletedBytes += Bytes;
339   if (Count) {
340     auto L = BC.scopeLock();
341     Modified.insert(&Function);
342     if (opts::Verbosity > 0)
343       outs() << "BOLT-INFO: removed " << Count
344              << " dead basic block(s) accounting for " << Bytes
345              << " bytes in function " << Function << '\n';
346   }
347 }
348 
349 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
350   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
351     runOnFunction(BF);
352   };
353 
354   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
355     return !shouldOptimize(BF) || BF.getLayout().block_empty();
356   };
357 
358   ParallelUtilities::runOnEachFunction(
359       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
360       SkipPredicate, "elimininate-unreachable");
361 
362   if (DeletedBlocks)
363     outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
364            << DeletedBytes << " bytes of code\n";
365   return Error::success();
366 }
367 
368 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
369   return (BinaryFunctionPass::shouldPrint(BF) &&
370           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
371 }
372 
373 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
374   // Apply execution count threshold
375   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
376     return false;
377 
378   return BinaryFunctionPass::shouldOptimize(BF);
379 }
380 
381 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
382   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
383     return Error::success();
384 
385   std::atomic_uint64_t ModifiedFuncCount(0);
386   std::mutex FunctionEditDistanceMutex;
387   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
388 
389   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
390     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
391     if (opts::PrintFuncStat > 0)
392       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
393 
394     const bool LayoutChanged =
395         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
396     if (LayoutChanged) {
397       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
398       if (opts::PrintFuncStat > 0) {
399         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
400         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
401         FunctionEditDistance[&BF] = Distance;
402       }
403     }
404   };
405 
406   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
407     return !shouldOptimize(BF);
408   };
409 
410   ParallelUtilities::runOnEachFunction(
411       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
412       "ReorderBasicBlocks");
413   const size_t NumAllProfiledFunctions =
414       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
415 
416   outs() << "BOLT-INFO: basic block reordering modified layout of "
417          << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
418                    ModifiedFuncCount.load(std::memory_order_relaxed),
419                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
420                        NumAllProfiledFunctions,
421                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
422                        BC.getBinaryFunctions().size());
423 
424   if (opts::PrintFuncStat > 0) {
425     raw_ostream &OS = outs();
426     // Collect the functions into a map keyed by function score to rank them.
427     std::map<uint64_t, BinaryFunction &> ScoreMap;
428     auto &BFs = BC.getBinaryFunctions();
429     for (auto It = BFs.begin(); It != BFs.end(); ++It)
430       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
431           It->second.getFunctionScore(), It->second));
432 
433     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
434     OS << "           There are " << BFs.size() << " functions in total. \n";
435     OS << "           Number of functions being modified: "
436        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
437     OS << "           User asks for detailed information on top "
438        << opts::PrintFuncStat << " functions. (Ranked by function score)"
439        << "\n\n";
440     uint64_t I = 0;
441     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
442              ScoreMap.rbegin();
443          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
444       BinaryFunction &Function = Rit->second;
445 
446       OS << "           Information for function of top: " << (I + 1) << ": \n";
447       OS << "             Function Score is: " << Function.getFunctionScore()
448          << "\n";
449       OS << "             There are " << Function.size()
450          << " number of blocks in this function.\n";
451       OS << "             There are " << Function.getInstructionCount()
452          << " number of instructions in this function.\n";
453       OS << "             The edit distance for this function is: "
454          << FunctionEditDistance.lookup(&Function) << "\n\n";
455     }
456   }
457   return Error::success();
458 }
459 
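// Compute a new basic block order for the function and apply it. The reverse
// layout requires no profile; otherwise a precise TSP solution is used for
// small functions (at most -tsp-threshold blocks) and greedy clustering
// heuristics are used for the rest. Returns true if the layout was changed.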
460 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
461                                               LayoutType Type,
462                                               bool MinBranchClusters) const {
463   if (BF.size() == 0 || Type == LT_NONE)
464     return false;
465 
466   BinaryFunction::BasicBlockOrderType NewLayout;
467   std::unique_ptr<ReorderAlgorithm> Algo;
468 
469   // Cannot do optimal layout without profile.
470   if (Type != LT_REVERSE && !BF.hasValidProfile())
471     return false;
472 
473   if (Type == LT_REVERSE) {
474     Algo.reset(new ReverseReorderAlgorithm());
475   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
476     // Work on optimal solution if problem is small enough
477     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
478     Algo.reset(new TSPReorderAlgorithm());
479   } else {
480     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
481 
482     std::unique_ptr<ClusterAlgorithm> CAlgo;
483     if (MinBranchClusters)
484       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
485     else
486       CAlgo.reset(new PHGreedyClusterAlgorithm());
487 
488     switch (Type) {
489     case LT_OPTIMIZE:
490       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
491       break;
492 
493     case LT_OPTIMIZE_BRANCH:
494       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
495       break;
496 
497     case LT_OPTIMIZE_CACHE:
498       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
499       break;
500 
501     case LT_OPTIMIZE_EXT_TSP:
502       Algo.reset(new ExtTSPReorderAlgorithm());
503       break;
504 
505     case LT_OPTIMIZE_SHUFFLE:
506       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
507       break;
508 
509     default:
510       llvm_unreachable("unexpected layout type");
511     }
512   }
513 
514   Algo->reorderBasicBlocks(BF, NewLayout);
515 
516   return BF.getLayout().update(NewLayout);
517 }
518 
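// Re-synchronize branch instructions with the CFG for every simple function
// that is going to be emitted.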
519 Error FixupBranches::runOnFunctions(BinaryContext &BC) {
520   for (auto &It : BC.getBinaryFunctions()) {
521     BinaryFunction &Function = It.second;
522     if (!BC.shouldEmit(Function) || !Function.isSimple())
523       continue;
524 
525     Function.fixBranches();
526   }
527   return Error::success();
528 }
529 
530 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
531   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
532     if (!BF.finalizeCFIState()) {
533       if (BC.HasRelocations) {
534         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
535                << ". Exiting.\n";
536         exit(1);
537       }
538       BF.setSimple(false);
539       return;
540     }
541 
542     BF.setFinalized();
543 
544     // Update exception handling information.
545     BF.updateEHRanges();
546   };
547 
548   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
549     return !BC.shouldEmit(BF);
550   };
551 
552   ParallelUtilities::runOnEachFunction(
553       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
554       SkipPredicate, "FinalizeFunctions");
555   return Error::success();
556 }
557 
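// In non-relocation mode, a function must fit into its original slot in the
// binary. Estimate the emitted size of each function and mark those whose hot
// part exceeds the maximum size as non-simple so their original code is kept.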
558 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
559   if (BC.HasRelocations)
560     return Error::success();
561 
562   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
563   // incorrect metadata.
564   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
565     uint64_t HotSize, ColdSize;
566     std::tie(HotSize, ColdSize) =
567         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
568     if (HotSize > BF.getMaxSize())
569       BF.setSimple(false);
570   };
571 
572   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
573     return !shouldOptimize(BF);
574   };
575 
576   ParallelUtilities::runOnEachFunction(
577       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
578       SkipFunc, "CheckLargeFunctions");
579 
580   return Error::success();
581 }
582 
583 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
584   // Unlike other passes, allow functions in non-CFG state.
585   return BF.isSimple() && !BF.isIgnored();
586 }
587 
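// Lower GnuArgsSize annotations on invoke instructions into explicit
// DW_CFA_GNU_args_size CFI instructions, emitting a new CFI only when the
// value changes within a fragment.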
588 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
589   // Convert GnuArgsSize annotations into CFIs.
590   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
591     for (FunctionFragment &FF : BF->getLayout().fragments()) {
592       // Reset at the start of the new fragment.
593       int64_t CurrentGnuArgsSize = 0;
594 
595       for (BinaryBasicBlock *const BB : FF) {
596         for (auto II = BB->begin(); II != BB->end(); ++II) {
597           if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
598             continue;
599 
600           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
601           assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
602           if (NewGnuArgsSize == CurrentGnuArgsSize)
603             continue;
604 
605           auto InsertII = BF->addCFIInstruction(
606               BB, II,
607               MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
608           CurrentGnuArgsSize = NewGnuArgsSize;
609           II = std::next(InsertII);
610         }
611       }
612     }
613   }
614   return Error::success();
615 }
616 
617 // Check for dirty state in MCSymbol objects that might be a consequence
617 // of running calculateEmittedSize() in parallel during the split-functions
618 // pass. If an inconsistent state is found (symbol already registered or
620 // already defined), clean it.
621 Error CleanMCState::runOnFunctions(BinaryContext &BC) {
622   MCContext &Ctx = *BC.Ctx;
623   for (const auto &SymMapEntry : Ctx.getSymbols()) {
624     const MCSymbol *S = SymMapEntry.second;
625     if (S->isDefined()) {
626       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
627                         << "\" is already defined\n");
628       const_cast<MCSymbol *>(S)->setUndefined();
629     }
630     if (S->isRegistered()) {
631       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
632                         << "\" is already registered\n");
633       const_cast<MCSymbol *>(S)->setIsRegistered(false);
634     }
635     LLVM_DEBUG(if (S->isVariable()) {
636       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
637     });
638   }
639   return Error::success();
640 }
641 
642 // This peephole fixes jump instructions that jump to another basic
643 // block with a single jump instruction, e.g.
644 //
645 // B0: ...
646 //     jmp  B1   (or jcc B1)
647 //
648 // B1: jmp  B2
649 //
650 // ->
651 //
652 // B0: ...
653 //     jmp  B2   (or jcc B2)
654 //
655 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
656   uint64_t NumDoubleJumps = 0;
657 
658   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
659   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
660   for (BinaryBasicBlock &BB : Function) {
661     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
662                              const MCSymbol *SuccSym) {
663       // Ignore infinite loop jumps or fallthrough tail jumps.
664       if (Pred == Succ || Succ == &BB)
665         return false;
666 
667       if (Succ) {
668         const MCSymbol *TBB = nullptr;
669         const MCSymbol *FBB = nullptr;
670         MCInst *CondBranch = nullptr;
671         MCInst *UncondBranch = nullptr;
672         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
673         if (!Res) {
674           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
675                      Pred->dump());
676           return false;
677         }
678         Pred->replaceSuccessor(&BB, Succ);
679 
680         // We must patch up any existing branch instructions to match up
681         // with the new successor.
682         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
683                "Predecessor block has inconsistent number of successors");
684         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
685           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
686         } else if (UncondBranch &&
687                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
688           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
689         } else if (!UncondBranch) {
690           assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
691                  "Don't add an explicit jump to a fallthrough block.");
692           Pred->addBranchInstruction(Succ);
693         }
694       } else {
695         // Succ will be null in the tail call case.  In this case we
696         // need to explicitly add a tail call instruction.
697         MCInst *Branch = Pred->getLastNonPseudoInstr();
698         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
699           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
700           Pred->removeSuccessor(&BB);
701           Pred->eraseInstruction(Pred->findInstruction(Branch));
702           Pred->addTailCallInstruction(SuccSym);
703         } else {
704           return false;
705         }
706       }
707 
708       ++NumDoubleJumps;
709       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
710                         << Pred->getName() << " -> " << BB.getName() << " to "
711                         << Pred->getName() << " -> " << SuccSym->getName()
712                         << (!Succ ? " (tail)\n" : "\n"));
713 
714       return true;
715     };
716 
717     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
718       continue;
719 
720     MCInst *Inst = BB.getFirstNonPseudoInstr();
721     const bool IsTailCall = MIB->isTailCall(*Inst);
722 
723     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
724       continue;
725 
726     // If we operate after SCTC make sure it's not a conditional tail call.
727     if (IsTailCall && MIB->isConditionalBranch(*Inst))
728       continue;
729 
730     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
731     BinaryBasicBlock *Succ = BB.getSuccessor();
732 
733     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
734       continue;
735 
736     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
737 
738     for (BinaryBasicBlock *Pred : Preds) {
739       if (Pred->isLandingPad())
740         continue;
741 
742       if (Pred->getSuccessor() == &BB ||
743           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
744           Pred->getConditionalSuccessor(false) == &BB)
745         if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
746           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
747                        BB.isEntryPoint());
748     }
749   }
750 
751   return NumDoubleJumps;
752 }
753 
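// Decide whether the conditional branch in PredBB should be rewritten into a
// conditional tail call to BB's callee, depending on -sctc-mode: always, only
// when the branch direction is preserved, or based on branch frequency data.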
754 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
755     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
756     const BinaryBasicBlock *BB, const bool DirectionFlag) {
757   if (BeenOptimized.count(PredBB))
758     return false;
759 
760   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
761 
762   if (IsForward)
763     ++NumOrigForwardBranches;
764   else
765     ++NumOrigBackwardBranches;
766 
767   if (opts::SctcMode == opts::SctcAlways)
768     return true;
769 
770   if (opts::SctcMode == opts::SctcPreserveDirection)
771     return IsForward == DirectionFlag;
772 
773   const ErrorOr<std::pair<double, double>> Frequency =
774       PredBB->getBranchStats(BB);
775 
776   // It's ok to rewrite the conditional branch if the new target will be
777   // a backward branch.
778 
779   // If no data available for these branches, then it should be ok to
780   // do the optimization since it will reduce code size.
781   if (Frequency.getError())
782     return true;
783 
784   // TODO: should this use misprediction frequency instead?
785   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
786                       (!IsForward && Frequency.get().first <= 0.5);
787 
788   return Result == DirectionFlag;
789 }
790 
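// Convert conditional branches that target a block consisting of a single
// direct tail call into conditional tail calls to the callee. The rewritten
// branch is annotated with "CTCTakenCount", execution counts are updated, and
// blocks that become unreachable are removed afterwards.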
791 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
792   // Need updated indices to correctly detect branch direction.
793   BF.getLayout().updateLayoutIndices();
794   BF.markUnreachableBlocks();
795 
796   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
797   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
798   uint64_t NumLocalCTCCandidates = 0;
799   uint64_t NumLocalCTCs = 0;
800   uint64_t LocalCTCTakenCount = 0;
801   uint64_t LocalCTCExecCount = 0;
802   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
803       NeedsUncondBranch;
804 
805   // Will the block be deleted by UCE?
806   auto isValid = [](const BinaryBasicBlock *BB) {
807     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
808   };
809 
810   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
811     // Locate BB with a single direct tail-call instruction.
812     if (BB->getNumNonPseudos() != 1)
813       continue;
814 
815     MCInst *Instr = BB->getFirstNonPseudoInstr();
816     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
817       continue;
818 
819     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
820     if (!CalleeSymbol)
821       continue;
822 
823     // Detect direction of the possible conditional tail call.
824     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
825 
826     // Iterate through all predecessors.
827     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
828       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
829       if (!CondSucc)
830         continue;
831 
832       ++NumLocalCTCCandidates;
833 
834       const MCSymbol *TBB = nullptr;
835       const MCSymbol *FBB = nullptr;
836       MCInst *CondBranch = nullptr;
837       MCInst *UncondBranch = nullptr;
838       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
839 
840       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
841       if (!Result) {
842         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
843                    PredBB->dump());
844         continue;
845       }
846 
847       assert(Result && "internal error analyzing conditional branch");
848       assert(CondBranch && "conditional branch expected");
849 
850       // It's possible that PredBB is also a successor to BB that may have
851       // been processed by a previous iteration of the SCTC loop, in which
852       // case it may have been marked invalid.  We should skip rewriting in
853       // this case.
854       if (!PredBB->isValid()) {
855         assert(PredBB->isSuccessor(BB) &&
856                "PredBB should be valid if it is not a successor to BB");
857         continue;
858       }
859 
860       // We don't want to reverse the direction of the branch in the new
861       // order without further profile analysis.
862       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
863       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
864         continue;
865 
866       // Record this block so that we don't try to optimize it twice.
867       BeenOptimized.insert(PredBB);
868 
869       uint64_t Count = 0;
870       if (CondSucc != BB) {
871         // Patch the new target address into the conditional branch.
872         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
873         // Since we reversed the condition on the branch we need to change
874         // the target for the unconditional branch or add a unconditional
875         // branch to the old target.  This has to be done manually since
876         // fixupBranches is not called after SCTC.
877         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
878         Count = PredBB->getFallthroughBranchInfo().Count;
879       } else {
880         // Change destination of the conditional branch.
881         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
882         Count = PredBB->getTakenBranchInfo().Count;
883       }
884       const uint64_t CTCTakenFreq =
885           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
886 
887       // Annotate it so that "isCall" returns true for this jcc.
888       MIB->setConditionalTailCall(*CondBranch);
889       // Add info about the conditional tail call frequency; otherwise, this
890       // info will be lost when we delete the associated BranchInfo entry.
891       auto &CTCAnnotation =
892           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
893       CTCAnnotation = CTCTakenFreq;
894 
895       // Remove the unused successor which may be eliminated later
896       // if there are no other users.
897       PredBB->removeSuccessor(BB);
898       // Update BB execution count
899       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
900         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
901       else if (CTCTakenFreq > BB->getKnownExecutionCount())
902         BB->setExecutionCount(0);
903 
904       ++NumLocalCTCs;
905       LocalCTCTakenCount += CTCTakenFreq;
906       LocalCTCExecCount += PredBB->getKnownExecutionCount();
907     }
908 
909     // Remove the block from CFG if all predecessors were removed.
910     BB->markValid(isValid(BB));
911   }
912 
913   // Add unconditional branches at the end of BBs to new successors
914   // as long as the successor is not a fallthrough.
915   for (auto &Entry : NeedsUncondBranch) {
916     BinaryBasicBlock *PredBB = Entry.first;
917     const BinaryBasicBlock *CondSucc = Entry.second;
918 
919     const MCSymbol *TBB = nullptr;
920     const MCSymbol *FBB = nullptr;
921     MCInst *CondBranch = nullptr;
922     MCInst *UncondBranch = nullptr;
923     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
924 
925     // Find the next valid block.  Invalid blocks will be deleted
926     // so they shouldn't be considered fallthrough targets.
927     const BinaryBasicBlock *NextBlock =
928         BF.getLayout().getBasicBlockAfter(PredBB, false);
929     while (NextBlock && !isValid(NextBlock))
930       NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
931 
932     // Get the unconditional successor to this block.
933     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
934     assert(PredSucc && "The other branch should be a tail call");
935 
936     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
937 
938     if (UncondBranch) {
939       if (HasFallthrough)
940         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
941       else
942         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
943     } else if (!HasFallthrough) {
944       MCInst Branch;
945       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
946       PredBB->addInstruction(Branch);
947     }
948   }
949 
950   if (NumLocalCTCs > 0) {
951     NumDoubleJumps += fixDoubleJumps(BF, true);
952     // Clean-up unreachable tail-call blocks.
953     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
954     DeletedBlocks += Stats.first;
955     DeletedBytes += Stats.second;
956 
957     assert(BF.validateCFG());
958   }
959 
960   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
961                     << " conditional tail calls from a total of "
962                     << NumLocalCTCCandidates << " candidates in function " << BF
963                     << ". CTCs execution count for this function is "
964                     << LocalCTCExecCount << " and CTC taken count is "
965                     << LocalCTCTakenCount << "\n";);
966 
967   NumTailCallsPatched += NumLocalCTCs;
968   NumCandidateTailCalls += NumLocalCTCCandidates;
969   CTCExecCount += LocalCTCExecCount;
970   CTCTakenCount += LocalCTCTakenCount;
971 
972   return NumLocalCTCs > 0;
973 }
974 
975 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
976   if (!BC.isX86())
977     return Error::success();
978 
979   for (auto &It : BC.getBinaryFunctions()) {
980     BinaryFunction &Function = It.second;
981 
982     if (!shouldOptimize(Function))
983       continue;
984 
985     if (fixTailCalls(Function)) {
986       Modified.insert(&Function);
987       Function.setHasCanonicalCFG(false);
988     }
989   }
990 
991   if (NumTailCallsPatched)
992     outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
993            << " tail calls (" << NumOrigForwardBranches << " forward)"
994            << " tail calls (" << NumOrigBackwardBranches << " backward)"
995            << " from a total of " << NumCandidateTailCalls << " while removing "
996            << NumDoubleJumps << " double jumps"
997            << " and removing " << DeletedBlocks << " basic blocks"
998            << " totalling " << DeletedBytes
999            << " bytes of code. CTCs total execution count is " << CTCExecCount
1000            << " and the number of times CTCs are taken is " << CTCTakenCount
1001            << "\n";
1002   return Error::success();
1003 }
1004 
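// Replace instructions with equivalent, shorter encodings where the target
// supports it (see MCPlusBuilder::shortenInstruction). Returns the number of
// instructions that were shortened.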
1005 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1006   uint64_t Count = 0;
1007   const BinaryContext &BC = Function.getBinaryContext();
1008   for (BinaryBasicBlock &BB : Function) {
1009     for (MCInst &Inst : BB) {
1010       MCInst OriginalInst;
1011       if (opts::Verbosity > 2)
1012         OriginalInst = Inst;
1013 
1014       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1015         continue;
1016 
1017       if (opts::Verbosity > 2) {
1018         auto L = BC.scopeLock(); // Keep the lock alive while printing.
1019         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1020         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1021         outs() << "BOLT-INFO: to:";
1022         BC.printInstruction(outs(), Inst, 0, &Function);
1023       }
1024 
1025       ++Count;
1026     }
1027   }
1028 
1029   return Count;
1030 }
1031 
1032 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1033   std::atomic<uint64_t> NumShortened{0};
1034   if (!BC.isX86())
1035     return Error::success();
1036 
1037   ParallelUtilities::runOnEachFunction(
1038       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1039       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1040       nullptr, "ShortenInstructions");
1041 
1042   if (NumShortened)
1043     outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1044   return Error::success();
1045 }
1046 
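// Insert a trap instruction immediately after every indirect tail call.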
1047 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1048   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1049   for (BinaryBasicBlock &BB : Function) {
1050     MCInst *Inst = BB.getLastNonPseudoInstr();
1051     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1052       MCInst Trap;
1053       if (MIB->createTrap(Trap)) {
1054         BB.addInstruction(Trap);
1055         ++TailCallTraps;
1056       }
1057     }
1058   }
1059 }
1060 
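// Remove conditional branches whose taken and fall-through successors are the
// same block, i.e. the condition has no effect on control flow.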
1061 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1062   for (BinaryBasicBlock &BB : Function) {
1063     if (BB.succ_size() != 2)
1064       continue;
1065 
1066     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1067     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1068     if (CondBB != UncondBB)
1069       continue;
1070 
1071     const MCSymbol *TBB = nullptr;
1072     const MCSymbol *FBB = nullptr;
1073     MCInst *CondBranch = nullptr;
1074     MCInst *UncondBranch = nullptr;
1075     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1076 
1077     // analyzeBranch() can fail due to unusual branch instructions,
1078     // e.g. jrcxz, or jump tables (indirect jump).
1079     if (!Result || !CondBranch)
1080       continue;
1081 
1082     BB.removeDuplicateConditionalSuccessor(CondBranch);
1083     ++NumUselessCondBranches;
1084   }
1085 }
1086 
1087 Error Peepholes::runOnFunctions(BinaryContext &BC) {
1088   const char Opts =
1089       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1090                       [](const char A, const PeepholeOpts B) { return A | B; });
1091   if (Opts == PEEP_NONE)
1092     return Error::success();
1093 
1094   for (auto &It : BC.getBinaryFunctions()) {
1095     BinaryFunction &Function = It.second;
1096     if (shouldOptimize(Function)) {
1097       if (Opts & PEEP_DOUBLE_JUMPS)
1098         NumDoubleJumps += fixDoubleJumps(Function, false);
1099       if (Opts & PEEP_TAILCALL_TRAPS)
1100         addTailcallTraps(Function);
1101       if (Opts & PEEP_USELESS_BRANCHES)
1102         removeUselessCondBranches(Function);
1103       assert(Function.validateCFG());
1104     }
1105   }
1106   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1107          << " double jumps patched.\n"
1108          << "BOLT-INFO: Peephole: " << TailCallTraps
1109          << " tail call traps inserted.\n"
1110          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1111          << " useless conditional branches removed.\n";
1112   return Error::success();
1113 }
1114 
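// For loads whose target address can be computed statically and falls into a
// read-only section with no relocations against it, try to replace the memory
// operand with an immediate taken from the section contents.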
1115 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1116   BinaryContext &BC = BF.getBinaryContext();
1117   MCPlusBuilder *MIB = BC.MIB.get();
1118 
1119   uint64_t NumLocalLoadsSimplified = 0;
1120   uint64_t NumDynamicLocalLoadsSimplified = 0;
1121   uint64_t NumLocalLoadsFound = 0;
1122   uint64_t NumDynamicLocalLoadsFound = 0;
1123 
1124   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
1125     for (MCInst &Inst : *BB) {
1126       unsigned Opcode = Inst.getOpcode();
1127       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1128 
1129       // Skip instructions that do not load from memory.
1130       if (!Desc.mayLoad())
1131         continue;
1132 
1133       // Try to statically evaluate the target memory address.
1134       uint64_t TargetAddress;
1135 
1136       if (MIB->hasPCRelOperand(Inst)) {
1137         // Try to find the symbol that corresponds to the PC-relative operand.
1138         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1139         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1140         assert(DispOpI->isExpr() &&
1141                "found PC-relative with non-symbolic displacement");
1142 
1143         // Get displacement symbol.
1144         const MCSymbol *DisplSymbol;
1145         uint64_t DisplOffset;
1146 
1147         std::tie(DisplSymbol, DisplOffset) =
1148             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1149 
1150         if (!DisplSymbol)
1151           continue;
1152 
1153         // Look up the symbol address in the global symbols map of the binary
1154         // context object.
1155         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1156         if (!BD)
1157           continue;
1158         TargetAddress = BD->getAddress() + DisplOffset;
1159       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1160         continue;
1161       }
1162 
1163       // Get the contents of the section containing the target address of the
1164       // memory operand. We are only interested in read-only sections.
1165       ErrorOr<BinarySection &> DataSection =
1166           BC.getSectionForAddress(TargetAddress);
1167       if (!DataSection || DataSection->isWritable())
1168         continue;
1169 
1170       if (BC.getRelocationAt(TargetAddress) ||
1171           BC.getDynamicRelocationAt(TargetAddress))
1172         continue;
1173 
1174       uint32_t Offset = TargetAddress - DataSection->getAddress();
1175       StringRef ConstantData = DataSection->getContents();
1176 
1177       ++NumLocalLoadsFound;
1178       if (BB->hasProfile())
1179         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1180 
1181       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1182         ++NumLocalLoadsSimplified;
1183         if (BB->hasProfile())
1184           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1185       }
1186     }
1187   }
1188 
1189   NumLoadsFound += NumLocalLoadsFound;
1190   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1191   NumLoadsSimplified += NumLocalLoadsSimplified;
1192   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1193 
1194   return NumLocalLoadsSimplified > 0;
1195 }
1196 
1197 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1198   for (auto &It : BC.getBinaryFunctions()) {
1199     BinaryFunction &Function = It.second;
1200     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1201       Modified.insert(&Function);
1202   }
1203 
1204   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1205          << NumLoadsFound << " loads from a statically computed address.\n"
1206          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1207          << "\n"
1208          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1209   return Error::success();
1210 }
1211 
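// Assign output code section names. Injected functions go to the injected
// code sections. In relocation mode, hot-text-mover functions get their own
// section; functions with a valid ordering index (or all functions, when cold
// code is not used) go to the main code section, the rest to the cold section.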
1212 Error AssignSections::runOnFunctions(BinaryContext &BC) {
1213   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1214     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1215     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1216   }
1217 
1218   // In non-relocation mode functions have pre-assigned section names.
1219   if (!BC.HasRelocations)
1220     return Error::success();
1221 
1222   const bool UseColdSection =
1223       BC.NumProfiledFuncs > 0 ||
1224       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1225   for (auto &BFI : BC.getBinaryFunctions()) {
1226     BinaryFunction &Function = BFI.second;
1227     if (opts::isHotTextMover(Function)) {
1228       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1229       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1230       continue;
1231     }
1232 
1233     if (!UseColdSection || Function.hasValidIndex())
1234       Function.setCodeSectionName(BC.getMainCodeSectionName());
1235     else
1236       Function.setCodeSectionName(BC.getColdCodeSectionName());
1237 
1238     if (Function.isSplit())
1239       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1240   }
1241   return Error::success();
1242 }
1243 
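// Report a profile "bias" score: the mean and standard deviation, over blocks
// with significant incoming flow, of the relative imbalance between incoming
// and outgoing edge counts, i.e. how far the profile deviates from flow
// conservation.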
1244 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1245   double FlowImbalanceMean = 0.0;
1246   size_t NumBlocksConsidered = 0;
1247   double WorstBias = 0.0;
1248   const BinaryFunction *WorstBiasFunc = nullptr;
1249 
1250   // For each function CFG, we fill an IncomingMap with the sum of the
1251   // frequencies of incoming edges for each BB, and an OutgoingMap with the
1252   // sum of the frequencies of outgoing edges.
1253   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1254   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1255   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1256 
1257   // Compute mean
1258   for (const auto &BFI : BC.getBinaryFunctions()) {
1259     const BinaryFunction &Function = BFI.second;
1260     if (Function.empty() || !Function.isSimple())
1261       continue;
1262     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1263     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1264     for (const BinaryBasicBlock &BB : Function) {
1265       uint64_t TotalOutgoing = 0ULL;
1266       auto SuccBIIter = BB.branch_info_begin();
1267       for (BinaryBasicBlock *Succ : BB.successors()) {
1268         uint64_t Count = SuccBIIter->Count;
1269         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1270           ++SuccBIIter;
1271           continue;
1272         }
1273         TotalOutgoing += Count;
1274         IncomingMap[Succ] += Count;
1275         ++SuccBIIter;
1276       }
1277       OutgoingMap[&BB] = TotalOutgoing;
1278     }
1279 
1280     size_t NumBlocks = 0;
1281     double Mean = 0.0;
1282     for (const BinaryBasicBlock &BB : Function) {
1283       // Do not compute the score for low-frequency, entry, or exit blocks.
1284       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1285         continue;
1286       ++NumBlocks;
1287       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1288       Mean += fabs(Difference / IncomingMap[&BB]);
1289     }
1290 
1291     FlowImbalanceMean += Mean;
1292     NumBlocksConsidered += NumBlocks;
1293     if (!NumBlocks)
1294       continue;
1295     double FuncMean = Mean / NumBlocks;
1296     if (FuncMean > WorstBias) {
1297       WorstBias = FuncMean;
1298       WorstBiasFunc = &Function;
1299     }
1300   }
1301   if (NumBlocksConsidered > 0)
1302     FlowImbalanceMean /= NumBlocksConsidered;
1303 
1304   // Compute standard deviation
1305   NumBlocksConsidered = 0;
1306   double FlowImbalanceVar = 0.0;
1307   for (const auto &BFI : BC.getBinaryFunctions()) {
1308     const BinaryFunction &Function = BFI.second;
1309     if (Function.empty() || !Function.isSimple())
1310       continue;
1311     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1312     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1313     for (const BinaryBasicBlock &BB : Function) {
1314       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1315         continue;
1316       ++NumBlocksConsidered;
1317       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1318       FlowImbalanceVar +=
1319           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1320     }
1321   }
1322   if (NumBlocksConsidered) {
1323     FlowImbalanceVar /= NumBlocksConsidered;
1324     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1325   }
1326 
1327   // Report to user
1328   outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1329                    (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1330   if (WorstBiasFunc && opts::Verbosity >= 1) {
1331     outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
1332            << "\n";
1333     LLVM_DEBUG(WorstBiasFunc->dump());
1334   }
1335   return Error::success();
1336 }
1337 
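// Print statistics about the profile attached to the binary: the number of
// profiled, stale, and inferred functions, the fraction of samples they
// account for, and diagnostics such as functions with unknown control flow.
// Exits with an error if the percentage of stale functions exceeds
// -stale-threshold.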
1338 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1339   uint64_t NumRegularFunctions = 0;
1340   uint64_t NumStaleProfileFunctions = 0;
1341   uint64_t NumAllStaleFunctions = 0;
1342   uint64_t NumInferredFunctions = 0;
1343   uint64_t NumNonSimpleProfiledFunctions = 0;
1344   uint64_t NumUnknownControlFlowFunctions = 0;
1345   uint64_t TotalSampleCount = 0;
1346   uint64_t StaleSampleCount = 0;
1347   uint64_t InferredSampleCount = 0;
1348   std::vector<const BinaryFunction *> ProfiledFunctions;
1349   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1350   for (auto &BFI : BC.getBinaryFunctions()) {
1351     const BinaryFunction &Function = BFI.second;
1352 
1353     // Ignore PLT functions for stats.
1354     if (Function.isPLTFunction())
1355       continue;
1356 
1357     ++NumRegularFunctions;
1358 
1359     if (!Function.isSimple()) {
1360       if (Function.hasProfile())
1361         ++NumNonSimpleProfiledFunctions;
1362       continue;
1363     }
1364 
1365     if (Function.hasUnknownControlFlow()) {
1366       if (opts::PrintUnknownCFG)
1367         Function.dump();
1368       else if (opts::PrintUnknown)
1369         errs() << "function with unknown control flow: " << Function << '\n';
1370 
1371       ++NumUnknownControlFlowFunctions;
1372     }
1373 
1374     if (!Function.hasProfile())
1375       continue;
1376 
1377     uint64_t SampleCount = Function.getRawBranchCount();
1378     TotalSampleCount += SampleCount;
1379 
1380     if (Function.hasValidProfile()) {
1381       ProfiledFunctions.push_back(&Function);
1382       if (Function.hasInferredProfile()) {
1383         ++NumInferredFunctions;
1384         InferredSampleCount += SampleCount;
1385         ++NumAllStaleFunctions;
1386       }
1387     } else {
1388       if (opts::ReportStaleFuncs) {
1389         outs() << StaleFuncsHeader;
1390         StaleFuncsHeader = "";
1391         outs() << "  " << Function << '\n';
1392       }
1393       ++NumStaleProfileFunctions;
1394       StaleSampleCount += SampleCount;
1395       ++NumAllStaleFunctions;
1396     }
1397   }
1398   BC.NumProfiledFuncs = ProfiledFunctions.size();
1399   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1400 
1401   const size_t NumAllProfiledFunctions =
1402       ProfiledFunctions.size() + NumStaleProfileFunctions;
1403   outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1404          << NumRegularFunctions << " functions in the binary ("
1405          << format("%.1f", NumAllProfiledFunctions /
1406                                (float)NumRegularFunctions * 100.0f)
1407          << "%) have non-empty execution profile\n";
1408   if (NumNonSimpleProfiledFunctions) {
1409     outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1410            << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1411            << " with profile could not be optimized\n";
1412   }
1413   if (NumAllStaleFunctions) {
1414     const float PctStale =
1415         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
1416     const float PctStaleFuncsWithEqualBlockCount =
1417         (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
1418         NumAllStaleFunctions * 100.0f;
1419     const float PctStaleBlocksWithEqualIcount =
1420         (float)BC.Stats.NumStaleBlocksWithEqualIcount /
1421         BC.Stats.NumStaleBlocks * 100.0f;
1422     auto printErrorOrWarning = [&]() {
1423       if (PctStale > opts::StaleThreshold)
1424         errs() << "BOLT-ERROR: ";
1425       else
1426         errs() << "BOLT-WARNING: ";
1427     };
1428     printErrorOrWarning();
1429     errs() << NumAllStaleFunctions
1430            << format(" (%.1f%% of all profiled)", PctStale) << " function"
1431            << (NumAllStaleFunctions == 1 ? " has" : "s have")
1432            << " invalid (possibly stale) profile."
1433               " Use -report-stale to see the list.\n";
1434     if (TotalSampleCount > 0) {
1435       printErrorOrWarning();
1436       errs() << (StaleSampleCount + InferredSampleCount) << " out of "
1437              << TotalSampleCount << " samples in the binary ("
1438              << format("%.1f",
1439                        ((100.0f * (StaleSampleCount + InferredSampleCount)) /
1440                         TotalSampleCount))
1441              << "%) belong to functions with invalid"
1442                 " (possibly stale) profile.\n";
1443     }
1444     outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
1445            << " stale function"
1446            << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
1447            << format(" (%.1f%% of all stale)", PctStaleFuncsWithEqualBlockCount)
1448            << " have matching block count.\n";
1449     outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
1450            << " stale block"
1451            << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
1452            << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
1453            << " have matching icount.\n";
1454     if (PctStale > opts::StaleThreshold) {
1455       errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
1456              << opts::StaleThreshold << "%. Exiting.\n";
1457       exit(1);
1458     }
1459   }
1460   if (NumInferredFunctions) {
1461     outs() << format("BOLT-INFO: inferred profile for %zu (%.2f%% of profiled, "
1462                      "%.2f%% of stale) functions responsible for %.2f%% samples"
1463                      " (%zu out of %zu)\n",
1464                      NumInferredFunctions,
1465                      100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1466                      100.0 * NumInferredFunctions / NumAllStaleFunctions,
1467                      100.0 * InferredSampleCount / TotalSampleCount,
1468                      InferredSampleCount, TotalSampleCount);
1469     outs() << format(
1470         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1471         " (%zu out of %zu stale) responsible for %.2f%% samples"
1472         " (%zu out of %zu stale)\n",
1473         100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1474         BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1475         100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1476         BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1477   }
1478 
1479   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1480     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1481            << " objects was ignored\n";
1482   }
1483 
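       // With -v=1 and more than 10 profiled functions, list the most
       // frequently called ones, up to opts::TopCalledLimit entries.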
1484   if (ProfiledFunctions.size() > 10) {
1485     if (opts::Verbosity >= 1) {
1486       outs() << "BOLT-INFO: top called functions are:\n";
1487       llvm::sort(ProfiledFunctions,
1488                  [](const BinaryFunction *A, const BinaryFunction *B) {
1489                    return B->getExecutionCount() < A->getExecutionCount();
1490                  });
1491       auto SFI = ProfiledFunctions.begin();
1492       auto SFIend = ProfiledFunctions.end();
1493       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1494            ++SFI, ++I)
1495         outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
1496     }
1497   }
1498 
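       // When opts::PrintSortedBy is non-empty, rank functions by the selected
       // dyno-stats categories (or by all stats at once) and print the top 100.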
1499   if (!opts::PrintSortedBy.empty()) {
1500     std::vector<BinaryFunction *> Functions;
1501     std::map<const BinaryFunction *, DynoStats> Stats;
1502 
1503     for (auto &BFI : BC.getBinaryFunctions()) {
1504       BinaryFunction &BF = BFI.second;
1505       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1506         Functions.push_back(&BF);
1507         Stats.emplace(&BF, getDynoStats(BF));
1508       }
1509     }
1510 
1511     const bool SortAll =
1512         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1513 
1514     const bool Ascending =
1515         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1516 
1517     if (SortAll) {
1518       llvm::stable_sort(Functions,
1519                         [Ascending, &Stats](const BinaryFunction *A,
1520                                             const BinaryFunction *B) {
1521                           return Ascending ? Stats.at(A) < Stats.at(B)
1522                                            : Stats.at(B) < Stats.at(A);
1523                         });
1524     } else {
1525       llvm::stable_sort(
1526           Functions, [Ascending, &Stats](const BinaryFunction *A,
1527                                          const BinaryFunction *B) {
1528             const DynoStats &StatsA = Stats.at(A);
1529             const DynoStats &StatsB = Stats.at(B);
1530             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1531                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1532           });
1533     }
1534 
1535     outs() << "BOLT-INFO: top functions sorted by ";
1536     if (SortAll) {
1537       outs() << "dyno stats";
1538     } else {
1539       outs() << "(";
1540       bool PrintComma = false;
1541       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1542         if (PrintComma)
1543           outs() << ", ";
1544         outs() << DynoStats::Description(Category);
1545         PrintComma = true;
1546       }
1547       outs() << ")";
1548     }
1549 
1550     outs() << " are:\n";
1551     auto SFI = Functions.begin();
1552     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1553       const DynoStats Stats = getDynoStats(**SFI);
1554       outs() << "  " << **SFI;
1555       if (!SortAll) {
1556         outs() << " (";
1557         bool PrintComma = false;
1558         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1559           if (PrintComma)
1560             outs() << ", ";
1561           outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1562           PrintComma = true;
1563         }
1564         outs() << ")";
1565       }
1566       outs() << "\n";
1567     }
1568   }
1569 
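       // Warn about functions that will trap on entry (see -trap-avx512).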
1570   if (!BC.TrappedFunctions.empty()) {
1571     errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1572            << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1573            << " will trap on entry. Use -trap-avx512=0 to disable"
1574               " traps.";
1575     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1576       errs() << '\n';
1577       for (const BinaryFunction *Function : BC.TrappedFunctions)
1578         errs() << "  " << *Function << '\n';
1579     } else {
1580       errs() << " Use -v=1 to see the list.\n";
1581     }
1582   }
1583 
1584   // Print information on missed macro-fusion opportunities seen on input.
1585   if (BC.Stats.MissedMacroFusionPairs) {
1586     outs() << format("BOLT-INFO: the input contains %zu (dynamic count: %zu)"
1587                      " opportunities for macro-fusion optimization",
1588                      BC.Stats.MissedMacroFusionPairs,
1589                      BC.Stats.MissedMacroFusionExecCount);
1590     switch (opts::AlignMacroOpFusion) {
1591     case MFT_NONE:
1592       outs() << ". Use -align-macro-fusion to fix.\n";
1593       break;
1594     case MFT_HOT:
1595       outs() << ". Will fix instances on a hot path.\n";
1596       break;
1597     case MFT_ALL:
1598       outs() << " that are going to be fixed.\n";
1599       break;
1600     }
1601   }
1602 
1603   // Collect and print information about suboptimal code layout on input.
1604   if (opts::ReportBadLayout) {
1605     std::vector<BinaryFunction *> SuboptimalFuncs;
1606     for (auto &BFI : BC.getBinaryFunctions()) {
1607       BinaryFunction &BF = BFI.second;
1608       if (!BF.hasValidProfile())
1609         continue;
1610 
1611       const uint64_t HotThreshold =
1612           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1613       bool HotSeen = false;
1614       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1615         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1616           HotSeen = true;
1617           continue;
1618         }
1619         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1620           SuboptimalFuncs.push_back(&BF);
1621           break;
1622         }
1623       }
1624     }
1625 
1626     if (!SuboptimalFuncs.empty()) {
1627       llvm::sort(SuboptimalFuncs,
1628                  [](const BinaryFunction *A, const BinaryFunction *B) {
1629                    return A->getKnownExecutionCount() / A->getSize() >
1630                           B->getKnownExecutionCount() / B->getSize();
1631                  });
1632 
1633       outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1634              << " functions have "
1635                 "cold code in the middle of hot code. Top functions are:\n";
1636       for (unsigned I = 0;
1637            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1638                         SuboptimalFuncs.size());
1639            ++I)
1640         SuboptimalFuncs[I]->print(outs());
1641     }
1642   }
1643 
1644   if (NumUnknownControlFlowFunctions) {
1645     outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1646            << " functions have instructions with unknown control flow";
1647     if (!opts::PrintUnknown)
1648       outs() << ". Use -print-unknown to see the list.";
1649     outs() << '\n';
1650   }
1651   return Error::success();
1652 }
1653 
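     // Lower tail-call instructions in every function to their final form.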
1654 Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1655   for (auto &BFI : BC.getBinaryFunctions())
1656     for (BinaryBasicBlock &BB : BFI.second)
1657       for (MCInst &Instruction : BB)
1658         BC.MIB->lowerTailCall(Instruction);
1659   return Error::success();
1660 }
1661 
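     // X86 only: strip the REP prefix from 'repz ret' when a return is the
     // last non-pseudo instruction of a basic block, saving one byte each.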
1662 Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1663   if (!BC.isX86())
1664     return Error::success();
1665 
1666   uint64_t NumPrefixesRemoved = 0;
1667   uint64_t NumBytesSaved = 0;
1668   for (auto &BFI : BC.getBinaryFunctions()) {
1669     for (BinaryBasicBlock &BB : BFI.second) {
1670       auto LastInstRIter = BB.getLastNonPseudo();
1671       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1672           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1673         continue;
1674 
1675       NumPrefixesRemoved += BB.getKnownExecutionCount();
1676       ++NumBytesSaved;
1677     }
1678   }
1679 
1680   if (NumBytesSaved)
1681     outs() << "BOLT-INFO: removed " << NumBytesSaved
1682            << " 'repz' prefixes"
1683               " with an estimated execution count of "
1684            << NumPrefixesRemoved << ".\n";
1685   return Error::success();
1686 }
1687 
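     // X86 only: replace calls to memcpy() (and the _memcpy8 variant) with an
     // inline expansion generated by the target MCPlusBuilder.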
1688 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1689   if (!BC.isX86())
1690     return Error::success();
1691 
1692   uint64_t NumInlined = 0;
1693   uint64_t NumInlinedDyno = 0;
1694   for (auto &BFI : BC.getBinaryFunctions()) {
1695     for (BinaryBasicBlock &BB : BFI.second) {
1696       for (auto II = BB.begin(); II != BB.end(); ++II) {
1697         MCInst &Inst = *II;
1698 
1699         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1700             !Inst.getOperand(0).isExpr())
1701           continue;
1702 
1703         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1704         if (CalleeSymbol->getName() != "memcpy" &&
1705             CalleeSymbol->getName() != "memcpy@PLT" &&
1706             CalleeSymbol->getName() != "_memcpy8")
1707           continue;
1708 
1709         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1710         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1711 
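             // Swap the call for the inlined sequence. If it was a tail call,
             // append an explicit return since control no longer leaves the
             // function through the callee.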
1712         const InstructionListType NewCode =
1713             BC.MIB->createInlineMemcpy(IsMemcpy8);
1714         II = BB.replaceInstruction(II, NewCode);
1715         std::advance(II, NewCode.size() - 1);
1716         if (IsTailCall) {
1717           MCInst Return;
1718           BC.MIB->createReturn(Return);
1719           II = BB.insertInstruction(std::next(II), std::move(Return));
1720         }
1721 
1722         ++NumInlined;
1723         NumInlinedDyno += BB.getKnownExecutionCount();
1724       }
1725     }
1726   }
1727 
1728   if (NumInlined) {
1729     outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1730     if (NumInlinedDyno)
1731       outs() << ". The calls were executed " << NumInlinedDyno
1732              << " times based on profile.";
1733     outs() << '\n';
1734   }
1735   return Error::success();
1736 }
1737 
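     // A function is a candidate only if its name matches one of the entries
     // in the user-provided spec of the form "<function>:<site>:<site>:...".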
1738 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1739   if (!BinaryFunctionPass::shouldOptimize(Function))
1740     return false;
1741 
1742   for (const std::string &FunctionSpec : Spec) {
1743     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1744     if (Function.hasNameRegex(FunctionName))
1745       return true;
1746   }
1747 
1748   return false;
1749 }
1750 
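     // Return the 1-based IDs of the memcpy() call sites selected for this
     // function in the spec; an empty set means all call sites qualify.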
1751 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1752     const BinaryFunction &Function) const {
1753   StringRef SitesString;
1754   for (const std::string &FunctionSpec : Spec) {
1755     StringRef FunctionName;
1756     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1757     if (Function.hasNameRegex(FunctionName))
1758       break;
1759     SitesString = "";
1760   }
1761 
1762   std::set<size_t> Sites;
1763   SmallVector<StringRef, 4> SitesVec;
1764   SitesString.split(SitesVec, ':');
1765   for (StringRef SiteString : SitesVec) {
1766     if (SiteString.empty())
1767       continue;
1768     size_t Result;
1769     if (!SiteString.getAsInteger(10, Result))
1770       Sites.emplace(Result);
1771   }
1772 
1773   return Sites;
1774 }
1775 
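     // X86 only: guard selected memcpy() call sites with a size check so that
     // single-byte copies bypass the library call.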
1776 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1777   if (!BC.isX86())
1778     return Error::success();
1779 
1780   uint64_t NumSpecialized = 0;
1781   uint64_t NumSpecializedDyno = 0;
1782   for (auto &BFI : BC.getBinaryFunctions()) {
1783     BinaryFunction &Function = BFI.second;
1784     if (!shouldOptimize(Function))
1785       continue;
1786 
1787     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1788     auto shouldOptimize = [&](size_t N) {
1789       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1790     };
1791 
1792     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1793     size_t CallSiteID = 0;
1794     for (BinaryBasicBlock *CurBB : Blocks) {
1795       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1796         MCInst &Inst = *II;
1797 
1798         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1799             !Inst.getOperand(0).isExpr())
1800           continue;
1801 
1802         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1803         if (CalleeSymbol->getName() != "memcpy" &&
1804             CalleeSymbol->getName() != "memcpy@PLT")
1805           continue;
1806 
1807         if (BC.MIB->isTailCall(Inst))
1808           continue;
1809 
1810         ++CallSiteID;
1811 
1812         if (!shouldOptimize(CallSiteID))
1813           continue;
1814 
1815         // Create a copy of a call to memcpy(dest, src, size).
1816         MCInst MemcpyInstr = Inst;
1817 
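             // Carve up the CFG around the call: CurBB ends with the size
             // check, OneByteMemcpyBB will hold the single-byte copy,
             // MemcpyBB keeps the original call for other sizes, and NextBB
             // resumes execution after the call.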
1818         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1819 
1820         BinaryBasicBlock *NextBB = nullptr;
1821         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1822           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1823           NextBB->eraseInstruction(NextBB->begin());
1824         } else {
1825           NextBB = OneByteMemcpyBB->getSuccessor();
1826           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1827           assert(NextBB && "unexpected call to memcpy() with no return");
1828         }
1829 
1830         BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
1831         MemcpyBB->setOffset(CurBB->getInputOffset());
1832         InstructionListType CmpJCC =
1833             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1834                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1835         CurBB->addInstructions(CmpJCC);
1836         CurBB->addSuccessor(MemcpyBB);
1837 
1838         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1839         MemcpyBB->addSuccessor(NextBB);
1840         MemcpyBB->setCFIState(NextBB->getCFIState());
1841         MemcpyBB->setExecutionCount(0);
1842 
1843         // To prevent the actual call from being moved to cold, we set its
1844         // execution count to 1.
1845         if (CurBB->getKnownExecutionCount() > 0)
1846           MemcpyBB->setExecutionCount(1);
1847 
1848         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1849         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1850 
1851         ++NumSpecialized;
1852         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1853 
1854         CurBB = NextBB;
1855 
1856         // Note: the loop's ++II will skip NextBB's first instruction;
             // that's fine, as we don't expect it to be another call to memcpy.
1857         II = CurBB->begin();
1858       }
1859     }
1860   }
1861 
1862   if (NumSpecialized) {
1863     outs() << "BOLT-INFO: specialized " << NumSpecialized
1864            << " memcpy() call sites for size 1";
1865     if (NumSpecializedDyno)
1866       outs() << ". The calls were executed " << NumSpecializedDyno
1867              << " times based on profile.";
1868     outs() << '\n';
1869   }
1870   return Error::success();
1871 }
1872 
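     // Erase NOPs carrying the "NOP" annotation, walking each block backwards
     // so that instruction indices remain valid during removal.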
1873 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1874   const BinaryContext &BC = BF.getBinaryContext();
1875   for (BinaryBasicBlock &BB : BF) {
1876     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1877       MCInst &Inst = BB.getInstructionAtIndex(I);
1878       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1879         BB.eraseInstructionAtIndex(I);
1880     }
1881   }
1882 }
1883 
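     // Run the removal in parallel over all functions, skipping those that
     // must preserve their NOP instructions.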
1884 Error RemoveNops::runOnFunctions(BinaryContext &BC) {
1885   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1886     runOnFunction(BF);
1887   };
1888 
1889   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1890     return BF.shouldPreserveNops();
1891   };
1892 
1893   ParallelUtilities::runOnEachFunction(
1894       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1895       SkipFunc, "RemoveNops");
1896   return Error::success();
1897 }
1898 
1899 } // namespace bolt
1900 } // namespace llvm
1901