xref: /llvm-project/bolt/lib/Passes/BinaryPasses.cpp (revision e28c393bd1c1f22f229d156b0b8283746cdac322)
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
#include "bolt/Passes/BinaryPasses.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <vector>
23 
24 #define DEBUG_TYPE "bolt-opts"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32 
33   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34 
35   OptNames[C] = bolt::DynoStats::Description(C);
36 
37   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
38 
39   return OptNames[C].c_str();
40 }
41 
namespace opts {

// Option categories shared across BOLT; defined in the driver.
extern cl::OptionCategory BoltCategory;
extern cl::OptionCategory BoltOptCategory;

// Options owned by other translation units that the passes below consult.
extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
extern cl::opt<unsigned> Verbosity;
extern cl::opt<bool> EnableBAT;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<bool> UpdateDebugSections;
extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;

// Sort direction for the -print-sorted-by report.
enum DynoStatsSortOrder : char {
  Ascending,
  Descending
};

static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
    "print-sorted-by-order",
    cl::desc("use ascending or descending order when printing functions "
             "ordered by dyno stats"),
    cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));

// Sections whose functions must be kept off the hot-text huge page;
// consulted by isHotTextMover() below.
cl::list<std::string>
HotTextMoveSections("hot-text-move-sections",
  cl::desc("list of sections containing functions used for hugifying hot text. "
           "BOLT makes sure these functions are not placed on the same page as "
           "the hot text. (default=\'.stub,.mover\')."),
  cl::value_desc("sec1,sec2,sec3,..."),
  cl::CommaSeparated,
  cl::ZeroOrMore,
  cl::cat(BoltCategory));
74 
75 bool isHotTextMover(const BinaryFunction &Function) {
76   for (std::string &SectionName : opts::HotTextMoveSections) {
77     if (Function.getOriginSectionName() &&
78         *Function.getOriginSectionName() == SectionName)
79       return true;
80   }
81 
82   return false;
83 }
84 
// Clustering variant used by ReorderBasicBlocks::modifyFunctionLayout().
static cl::opt<bool> MinBranchClusters(
    "min-branch-clusters",
    cl::desc("use a modified clustering algorithm geared towards minimizing "
             "branches"),
    cl::Hidden, cl::cat(BoltOptCategory));

// Set of peephole optimizations to run; combined as a bitmask by the
// Peepholes pass.
static cl::list<Peepholes::PeepholeOpts> Peepholes(
    "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
    cl::value_desc("opt1,opt2,opt3,..."),
    cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
               clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
                          "remove double jumps when able"),
               clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
                          "insert tail call traps"),
               clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
                          "remove useless conditional branches"),
               clEnumValN(Peepholes::PEEP_ALL, "all",
                          "enable all peephole optimizations")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Number of top-scored functions to describe in the block-reordering report.
static cl::opt<unsigned>
    PrintFuncStat("print-function-statistics",
                  cl::desc("print statistics about basic block ordering"),
                  cl::init(0), cl::cat(BoltOptCategory));

// Dyno-stat keys to sort by; option names are derived from the stat
// descriptions via dynoStatsOptName().
static cl::list<bolt::DynoStats::Category>
    PrintSortedBy("print-sorted-by", cl::CommaSeparated,
                  cl::desc("print functions sorted by order of dyno stats"),
                  cl::value_desc("key1,key2,key3,..."),
                  cl::values(
#define D(name, description, ...)                                              \
  clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
             description),
                      REAL_DYNO_STATS
#undef D
                          clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
                                     "sorted by all names")),
                  cl::ZeroOrMore, cl::cat(BoltOptCategory));

// Diagnostics for functions whose control flow could not be reconstructed.
static cl::opt<bool>
    PrintUnknown("print-unknown",
                 cl::desc("print names of functions with unknown control flow"),
                 cl::cat(BoltCategory), cl::Hidden);

static cl::opt<bool>
    PrintUnknownCFG("print-unknown-cfg",
                    cl::desc("dump CFG of functions with unknown control flow"),
                    cl::cat(BoltCategory), cl::ReallyHidden);
133 
// Please MSVC19 with a forward declaration: otherwise it reports an error about
// an undeclared variable inside a callback.
extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
    "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
    cl::init(bolt::ReorderBasicBlocks::LT_NONE),
    cl::values(
        clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
                   "do not reorder basic blocks"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
                   "layout blocks in reverse order"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
                   "perform optimal layout based on profile"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
                   "branch-predictor",
                   "perform optimal layout prioritizing branch "
                   "predictions"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
                   "perform optimal layout prioritizing I-cache "
                   "behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
                   "perform layout optimizing I-cache behavior"),
        clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
                   "cluster-shuffle", "perform random layout of clusters")),
    cl::ZeroOrMore, cl::cat(BoltOptCategory),
    // Transparently remap the deprecated "cache+" spelling onto "ext-tsp",
    // warning the user once at option-parse time.
    cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
      if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
        errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
               << " use '-reorder-blocks=ext-tsp' instead\n";
        ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
      }
    }));
168 
// Report functions whose input layout looks suboptimal (see PrintProgramStats).
static cl::opt<unsigned> ReportBadLayout(
    "report-bad-layout",
    cl::desc("print top <uint> functions with suboptimal code layout on input"),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool>
    ReportStaleFuncs("report-stale",
                     cl::desc("print the list of functions with stale profile"),
                     cl::Hidden, cl::cat(BoltOptCategory));

// Policies for the SimplifyConditionalTailCalls (SCTC) pass; consulted by
// shouldRewriteBranch().
enum SctcModes : char {
  SctcAlways,
  SctcPreserveDirection,
  SctcHeuristic
};

static cl::opt<SctcModes>
SctcMode("sctc-mode",
  cl::desc("mode for simplify conditional tail calls"),
  cl::init(SctcAlways),
  cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
    clEnumValN(SctcPreserveDirection,
      "preserve",
      "only perform sctc when branch direction is "
      "preserved"),
    clEnumValN(SctcHeuristic,
      "heuristic",
      "use branch prediction data to control sctc")),
  cl::ZeroOrMore,
  cl::cat(BoltOptCategory));

// Abort threshold for the fraction of functions with stale profile.
static cl::opt<unsigned>
StaleThreshold("stale-threshold",
    cl::desc(
      "maximum percentage of stale functions to tolerate (default: 100)"),
    cl::init(100),
    cl::Hidden,
    cl::cat(BoltOptCategory));

// Above this many hot blocks, block reordering falls back from the exact TSP
// solver to heuristics (see modifyFunctionLayout).
static cl::opt<unsigned> TSPThreshold(
    "tsp-threshold",
    cl::desc(
        "maximum number of hot basic blocks in a function for which to use "
        "a precise TSP solution while re-ordering basic blocks"),
    cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<unsigned> TopCalledLimit(
    "top-called-limit",
    cl::desc("maximum number of functions to print in top called "
             "functions section"),
    cl::init(100), cl::Hidden, cl::cat(BoltCategory));

} // namespace opts
222 
223 namespace llvm {
224 namespace bolt {
225 
226 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
227   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
228          !BF.isIgnored();
229 }
230 
231 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
232   return BF.isSimple() && !BF.isIgnored();
233 }
234 
void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
  uint64_t NumRemoved = 0;
  uint64_t NumDuplicateEdges = 0;
  uint64_t NeedsFixBranches = 0;
  // Pass 1: eliminate empty basic blocks (except entry points and landing
  // pads) by rerouting their predecessors directly to their successor.
  for (BinaryBasicBlock &BB : BF) {
    if (!BB.empty())
      continue;

    if (BB.isEntryPoint() || BB.isLandingPad())
      continue;

    // Handle a dangling empty block.
    if (BB.succ_size() == 0) {
      // If an empty dangling basic block has a predecessor, it could be a
      // result of codegen for __builtin_unreachable. In such case, do not
      // remove the block.
      if (BB.pred_size() == 0) {
        BB.markValid(false);
        ++NumRemoved;
      }
      continue;
    }

    // The block should have just one successor.
    BinaryBasicBlock *Successor = BB.getSuccessor();
    assert(Successor && "invalid CFG encountered");

    // Redirect all predecessors to the successor block.
    while (!BB.pred_empty()) {
      BinaryBasicBlock *Predecessor = *BB.pred_begin();
      // Do not redirect jump-table predecessors or predecessors that already
      // are the successor; BB then keeps at least one predecessor and
      // survives the cleanup below.
      if (Predecessor->hasJumpTable())
        break;

      if (Predecessor == Successor)
        break;

      BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
      Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
                                    BI.MispredictedCount);
      // We need to fix branches even if we failed to replace all successors
      // and remove the block.
      NeedsFixBranches = true;
    }

    // Only drop the block once every predecessor was rerouted.
    if (BB.pred_empty()) {
      BB.removeAllSuccessors();
      BB.markValid(false);
      ++NumRemoved;
    }
  }

  if (NumRemoved)
    BF.eraseInvalidBBs();

  // Check for duplicate successors. Do it after the empty block elimination as
  // we can get more duplicate successors.
  for (BinaryBasicBlock &BB : BF)
    if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
        BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
      ++NumDuplicateEdges;

  // fixBranches() will get rid of duplicate edges and update jump instructions.
  if (NumDuplicateEdges || NeedsFixBranches)
    BF.fixBranches();

  // Aggregate per-function counts into the pass-wide statistics.
  NumDuplicateEdgesMerged += NumDuplicateEdges;
  NumBlocksRemoved += NumRemoved;
}
303 
304 void NormalizeCFG::runOnFunctions(BinaryContext &BC) {
305   ParallelUtilities::runOnEachFunction(
306       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
307       [&](BinaryFunction &BF) { runOnFunction(BF); },
308       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
309       "NormalizeCFG");
310   if (NumBlocksRemoved)
311     outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
312            << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
313   if (NumDuplicateEdgesMerged)
314     outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
315            << " duplicate CFG edge" << (NumDuplicateEdgesMerged == 1 ? "" : "s")
316            << '\n';
317 }
318 
319 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
320   if (!Function.getLayout().block_empty()) {
321     unsigned Count;
322     uint64_t Bytes;
323     Function.markUnreachableBlocks();
324     LLVM_DEBUG({
325       for (BinaryBasicBlock &BB : Function) {
326         if (!BB.isValid()) {
327           dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
328                  << " in function " << Function << "\n";
329           Function.dump();
330         }
331       }
332     });
333     std::tie(Count, Bytes) = Function.eraseInvalidBBs();
334     DeletedBlocks += Count;
335     DeletedBytes += Bytes;
336     if (Count) {
337       Modified.insert(&Function);
338       if (opts::Verbosity > 0)
339         outs() << "BOLT-INFO: removed " << Count
340                << " dead basic block(s) accounting for " << Bytes
341                << " bytes in function " << Function << '\n';
342     }
343   }
344 }
345 
346 void EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
347   for (auto &It : BC.getBinaryFunctions()) {
348     BinaryFunction &Function = It.second;
349     if (shouldOptimize(Function))
350       runOnFunction(Function);
351   }
352 
353   if (DeletedBlocks)
354     outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
355            << DeletedBytes << " bytes of code\n";
356 }
357 
358 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
359   return (BinaryFunctionPass::shouldPrint(BF) &&
360           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
361 }
362 
363 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
364   // Apply execution count threshold
365   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
366     return false;
367 
368   return BinaryFunctionPass::shouldOptimize(BF);
369 }
370 
371 void ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
372   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
373     return;
374 
375   std::atomic_uint64_t ModifiedFuncCount(0);
376   std::mutex FunctionEditDistanceMutex;
377   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
378 
379   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
380     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
381     if (opts::PrintFuncStat > 0)
382       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
383 
384     const bool LayoutChanged =
385         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
386     if (LayoutChanged) {
387       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
388       if (opts::PrintFuncStat > 0) {
389         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
390         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
391         FunctionEditDistance[&BF] = Distance;
392       }
393     }
394   };
395 
396   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
397     return !shouldOptimize(BF);
398   };
399 
400   ParallelUtilities::runOnEachFunction(
401       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
402       "ReorderBasicBlocks");
403   const size_t NumAllProfiledFunctions =
404       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
405 
406   outs() << "BOLT-INFO: basic block reordering modified layout of "
407          << format("%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
408                    ModifiedFuncCount.load(std::memory_order_relaxed),
409                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
410                        NumAllProfiledFunctions,
411                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
412                        BC.getBinaryFunctions().size());
413 
414   if (opts::PrintFuncStat > 0) {
415     raw_ostream &OS = outs();
416     // Copy all the values into vector in order to sort them
417     std::map<uint64_t, BinaryFunction &> ScoreMap;
418     auto &BFs = BC.getBinaryFunctions();
419     for (auto It = BFs.begin(); It != BFs.end(); ++It)
420       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
421           It->second.getFunctionScore(), It->second));
422 
423     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
424     OS << "           There are " << BFs.size() << " functions in total. \n";
425     OS << "           Number of functions being modified: "
426        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
427     OS << "           User asks for detailed information on top "
428        << opts::PrintFuncStat << " functions. (Ranked by function score)"
429        << "\n\n";
430     uint64_t I = 0;
431     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
432              ScoreMap.rbegin();
433          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
434       BinaryFunction &Function = Rit->second;
435 
436       OS << "           Information for function of top: " << (I + 1) << ": \n";
437       OS << "             Function Score is: " << Function.getFunctionScore()
438          << "\n";
439       OS << "             There are " << Function.size()
440          << " number of blocks in this function.\n";
441       OS << "             There are " << Function.getInstructionCount()
442          << " number of instructions in this function.\n";
443       OS << "             The edit distance for this function is: "
444          << FunctionEditDistance.lookup(&Function) << "\n\n";
445     }
446   }
447 }
448 
449 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
450                                               LayoutType Type,
451                                               bool MinBranchClusters) const {
452   if (BF.size() == 0 || Type == LT_NONE)
453     return false;
454 
455   BinaryFunction::BasicBlockOrderType NewLayout;
456   std::unique_ptr<ReorderAlgorithm> Algo;
457 
458   // Cannot do optimal layout without profile.
459   if (Type != LT_REVERSE && !BF.hasValidProfile())
460     return false;
461 
462   if (Type == LT_REVERSE) {
463     Algo.reset(new ReverseReorderAlgorithm());
464   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
465     // Work on optimal solution if problem is small enough
466     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
467     Algo.reset(new TSPReorderAlgorithm());
468   } else {
469     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
470 
471     std::unique_ptr<ClusterAlgorithm> CAlgo;
472     if (MinBranchClusters)
473       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
474     else
475       CAlgo.reset(new PHGreedyClusterAlgorithm());
476 
477     switch (Type) {
478     case LT_OPTIMIZE:
479       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
480       break;
481 
482     case LT_OPTIMIZE_BRANCH:
483       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
484       break;
485 
486     case LT_OPTIMIZE_CACHE:
487       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
488       break;
489 
490     case LT_OPTIMIZE_EXT_TSP:
491       Algo.reset(new ExtTSPReorderAlgorithm());
492       break;
493 
494     case LT_OPTIMIZE_SHUFFLE:
495       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
496       break;
497 
498     default:
499       llvm_unreachable("unexpected layout type");
500     }
501   }
502 
503   Algo->reorderBasicBlocks(BF, NewLayout);
504 
505   return BF.getLayout().update(NewLayout);
506 }
507 
508 void FixupBranches::runOnFunctions(BinaryContext &BC) {
509   for (auto &It : BC.getBinaryFunctions()) {
510     BinaryFunction &Function = It.second;
511     if (!BC.shouldEmit(Function) || !Function.isSimple())
512       continue;
513 
514     Function.fixBranches();
515   }
516 }
517 
518 void FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
519   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
520     if (!BF.finalizeCFIState()) {
521       if (BC.HasRelocations) {
522         errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
523                << ". Exiting.\n";
524         exit(1);
525       }
526       BF.setSimple(false);
527       return;
528     }
529 
530     BF.setFinalized();
531 
532     // Update exception handling information.
533     BF.updateEHRanges();
534   };
535 
536   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
537     return !BC.shouldEmit(BF);
538   };
539 
540   ParallelUtilities::runOnEachFunction(
541       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
542       SkipPredicate, "FinalizeFunctions");
543 }
544 
545 void CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
546   if (BC.HasRelocations)
547     return;
548 
549   if (!opts::UpdateDebugSections)
550     return;
551 
552   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
553   // incorrect debug info.
554   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
555     uint64_t HotSize, ColdSize;
556     std::tie(HotSize, ColdSize) =
557         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
558     if (HotSize > BF.getMaxSize())
559       BF.setSimple(false);
560   };
561 
562   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
563     return !shouldOptimize(BF);
564   };
565 
566   ParallelUtilities::runOnEachFunction(
567       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
568       SkipFunc, "CheckLargeFunctions");
569 }
570 
571 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
572   // Unlike other passes, allow functions in non-CFG state.
573   return BF.isSimple() && !BF.isIgnored();
574 }
575 
// Strip MCPlus annotations from every instruction before emission, preserving
// only the offset and label annotations that code emission still needs and
// converting GnuArgsSize annotations into explicit CFI instructions.
void LowerAnnotations::runOnFunctions(BinaryContext &BC) {
  // Annotations to re-attach after the wholesale strip below.
  std::vector<std::pair<MCInst *, uint32_t>> PreservedOffsetAnnotations;
  std::vector<std::pair<MCInst *, MCSymbol *>> PreservedLabelAnnotations;

  for (auto &It : BC.getBinaryFunctions()) {
    BinaryFunction &BF = It.second;

    for (FunctionFragment &FF : BF.getLayout().fragments()) {
      // GNU_args_size is tracked per fragment, starting at zero.
      int64_t CurrentGnuArgsSize = 0;

      for (BinaryBasicBlock *const BB : FF) {
        // First convert GnuArgsSize annotations into CFIs. This may change
        // instr pointers, so do it before recording ptrs for preserved
        // annotations
        if (BF.usesGnuArgsSize()) {
          for (auto II = BB->begin(); II != BB->end(); ++II) {
            if (!BC.MIB->isInvoke(*II))
              continue;
            const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
            assert(NewGnuArgsSize >= 0 &&
                   "expected non-negative GNU_args_size");
            // Only emit a CFI when the value actually changes.
            if (NewGnuArgsSize != CurrentGnuArgsSize) {
              auto InsertII = BF.addCFIInstruction(
                  BB, II,
                  MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
              CurrentGnuArgsSize = NewGnuArgsSize;
              // Resume iteration after the newly inserted CFI.
              II = std::next(InsertII);
            }
          }
        }

        // Now record preserved annotations separately and then strip
        // annotations.
        for (auto II = BB->begin(); II != BB->end(); ++II) {
          if (BF.requiresAddressTranslation() && BC.MIB->getOffset(*II))
            PreservedOffsetAnnotations.emplace_back(&(*II),
                                                    *BC.MIB->getOffset(*II));
          if (MCSymbol *Label = BC.MIB->getLabel(*II))
            PreservedLabelAnnotations.emplace_back(&*II, Label);
          BC.MIB->stripAnnotations(*II);
        }
      }
    }
  }
  // Injected functions only need their label annotations preserved.
  for (BinaryFunction *BF : BC.getInjectedBinaryFunctions())
    for (BinaryBasicBlock &BB : *BF)
      for (MCInst &Instruction : BB) {
        if (MCSymbol *Label = BC.MIB->getLabel(Instruction))
          PreservedLabelAnnotations.emplace_back(&Instruction, Label);
        BC.MIB->stripAnnotations(Instruction);
      }

  // Release all memory taken by annotations
  BC.MIB->freeAnnotations();

  // Reinsert preserved annotations we need during code emission.
  for (const std::pair<MCInst *, uint32_t> &Item : PreservedOffsetAnnotations)
    BC.MIB->setOffset(*Item.first, Item.second);
  for (auto [Instr, Label] : PreservedLabelAnnotations)
    BC.MIB->setLabel(*Instr, Label);
}
637 
638 // Check for dirty state in MCSymbol objects that might be a consequence
639 // of running calculateEmittedSize() in parallel, during split functions
640 // pass. If an inconsistent state is found (symbol already registered or
641 // already defined), clean it.
642 void CleanMCState::runOnFunctions(BinaryContext &BC) {
643   MCContext &Ctx = *BC.Ctx;
644   for (const auto &SymMapEntry : Ctx.getSymbols()) {
645     const MCSymbol *S = SymMapEntry.second;
646     if (S->isDefined()) {
647       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
648                         << "\" is already defined\n");
649       const_cast<MCSymbol *>(S)->setUndefined();
650     }
651     if (S->isRegistered()) {
652       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
653                         << "\" is already registered\n");
654       const_cast<MCSymbol *>(S)->setIsRegistered(false);
655     }
656     LLVM_DEBUG(if (S->isVariable()) {
657       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
658     });
659   }
660 }
661 
662 // This peephole fixes jump instructions that jump to another basic
663 // block with a single jump instruction, e.g.
664 //
665 // B0: ...
666 //     jmp  B1   (or jcc B1)
667 //
668 // B1: jmp  B2
669 //
670 // ->
671 //
672 // B0: ...
673 //     jmp  B2   (or jcc B2)
674 //
/// Remove "double jumps": retarget every predecessor of a block that contains
/// only a single unconditional jump (or tail call) so it branches directly to
/// that jump's destination. Returns the number of jumps removed. When
/// \p MarkInvalid is set, blocks left without predecessors are flagged for
/// removal by a later cleanup.
static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
  uint64_t NumDoubleJumps = 0;

  MCContext *Ctx = Function.getBinaryContext().Ctx.get();
  MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
  for (BinaryBasicBlock &BB : Function) {
    // Try to reroute Pred past BB: either to CFG successor Succ, or — when
    // Succ is null (tail-call case) — to the symbol SuccSym via a new tail
    // call. Returns true if Pred was patched.
    auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
                             const MCSymbol *SuccSym) {
      // Ignore infinite loop jumps or fallthrough tail jumps.
      if (Pred == Succ || Succ == &BB)
        return false;

      if (Succ) {
        const MCSymbol *TBB = nullptr;
        const MCSymbol *FBB = nullptr;
        MCInst *CondBranch = nullptr;
        MCInst *UncondBranch = nullptr;
        bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
        if (!Res) {
          LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
                     Pred->dump());
          return false;
        }
        Pred->replaceSuccessor(&BB, Succ);

        // We must patch up any existing branch instructions to match up
        // with the new successor.
        assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
               "Predecessor block has inconsistent number of successors");
        if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
        } else if (UncondBranch &&
                   MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
          MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
        } else if (!UncondBranch) {
          // Pred reached BB by fallthrough; it now needs an explicit jump.
          assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
                 "Don't add an explicit jump to a fallthrough block.");
          Pred->addBranchInstruction(Succ);
        }
      } else {
        // Succ will be null in the tail call case.  In this case we
        // need to explicitly add a tail call instruction.
        MCInst *Branch = Pred->getLastNonPseudoInstr();
        if (Branch && MIB->isUnconditionalBranch(*Branch)) {
          assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
          Pred->removeSuccessor(&BB);
          Pred->eraseInstruction(Pred->findInstruction(Branch));
          Pred->addTailCallInstruction(SuccSym);
        } else {
          return false;
        }
      }

      ++NumDoubleJumps;
      LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
                        << Pred->getName() << " -> " << BB.getName() << " to "
                        << Pred->getName() << " -> " << SuccSym->getName()
                        << (!Succ ? " (tail)\n" : "\n"));

      return true;
    };

    // Candidate middle blocks hold exactly one real instruction and are not
    // landing pads.
    if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
      continue;

    MCInst *Inst = BB.getFirstNonPseudoInstr();
    const bool IsTailCall = MIB->isTailCall(*Inst);

    if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
      continue;

    // If we operate after SCTC make sure it's not a conditional tail call.
    if (IsTailCall && MIB->isConditionalBranch(*Inst))
      continue;

    const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
    BinaryBasicBlock *Succ = BB.getSuccessor();

    // Need either a real CFG successor (jump case) or a target symbol
    // (tail-call case); self-loops are skipped.
    if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
      continue;

    // Copy the predecessor list: checkAndPatch mutates BB's predecessors.
    std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};

    for (BinaryBasicBlock *Pred : Preds) {
      if (Pred->isLandingPad())
        continue;

      if (Pred->getSuccessor() == &BB ||
          (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
          Pred->getConditionalSuccessor(false) == &BB)
        if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
          // BB survives only if something still reaches or anchors it.
          BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
                       BB.isEntryPoint());
    }
  }

  return NumDoubleJumps;
}
773 
774 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
775     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
776     const BinaryBasicBlock *BB, const bool DirectionFlag) {
777   if (BeenOptimized.count(PredBB))
778     return false;
779 
780   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
781 
782   if (IsForward)
783     ++NumOrigForwardBranches;
784   else
785     ++NumOrigBackwardBranches;
786 
787   if (opts::SctcMode == opts::SctcAlways)
788     return true;
789 
790   if (opts::SctcMode == opts::SctcPreserveDirection)
791     return IsForward == DirectionFlag;
792 
793   const ErrorOr<std::pair<double, double>> Frequency =
794       PredBB->getBranchStats(BB);
795 
796   // It's ok to rewrite the conditional branch if the new target will be
797   // a backward branch.
798 
799   // If no data available for these branches, then it should be ok to
800   // do the optimization since it will reduce code size.
801   if (Frequency.getError())
802     return true;
803 
804   // TODO: should this use misprediction frequency instead?
805   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
806                       (!IsForward && Frequency.get().first <= 0.5);
807 
808   return Result == DirectionFlag;
809 }
810 
// Fold single-instruction tail-call blocks into their conditional
// predecessors, turning "jcc BB; ... BB: jmp callee" into a conditional tail
// call "jcc callee". Returns a nonzero value iff at least one conditional
// tail call was created in \p BF.
uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
  // Need updated indices to correctly detect branch' direction.
  BF.getLayout().updateLayoutIndices();
  BF.markUnreachableBlocks();

  MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
  MCContext *Ctx = BF.getBinaryContext().Ctx.get();
  // Per-function counters; folded into the pass-wide totals at the end.
  uint64_t NumLocalCTCCandidates = 0;
  uint64_t NumLocalCTCs = 0;
  uint64_t LocalCTCTakenCount = 0;
  uint64_t LocalCTCExecCount = 0;
  // Predecessors whose condition was reversed and that still need their
  // unconditional branch re-targeted (or added) in the fixup loop below.
  std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
      NeedsUncondBranch;

  // Will block be deleted by UCE?
  auto isValid = [](const BinaryBasicBlock *BB) {
    return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
  };

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Locate BB with a single direct tail-call instruction.
    if (BB->getNumNonPseudos() != 1)
      continue;

    MCInst *Instr = BB->getFirstNonPseudoInstr();
    if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
      continue;

    const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
    if (!CalleeSymbol)
      continue;

    // Detect direction of the possible conditional tail call.
    const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);

    // Iterate through all predecessors.
    for (BinaryBasicBlock *PredBB : BB->predecessors()) {
      BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
      if (!CondSucc)
        continue;

      ++NumLocalCTCCandidates;

      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

      // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
      if (!Result) {
        LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
                   PredBB->dump());
        continue;
      }

      assert(Result && "internal error analyzing conditional branch");
      assert(CondBranch && "conditional branch expected");

      // It's possible that PredBB is also a successor to BB that may have
      // been processed by a previous iteration of the SCTC loop, in which
      // case it may have been marked invalid.  We should skip rewriting in
      // this case.
      if (!PredBB->isValid()) {
        assert(PredBB->isSuccessor(BB) &&
               "PredBB should be valid if it is not a successor to BB");
        continue;
      }

      // We don't want to reverse direction of the branch in new order
      // without further profile analysis.
      const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
      if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
        continue;

      // Record this block so that we don't try to optimize it twice.
      BeenOptimized.insert(PredBB);

      uint64_t Count = 0;
      if (CondSucc != BB) {
        // BB is reached on the fall-through path: reverse the condition so
        // the taken path becomes the tail call.
        // Patch the new target address into the conditional branch.
        MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
        // Since we reversed the condition on the branch we need to change
        // the target for the unconditional branch or add a unconditional
        // branch to the old target.  This has to be done manually since
        // fixupBranches is not called after SCTC.
        NeedsUncondBranch.emplace_back(PredBB, CondSucc);
        Count = PredBB->getFallthroughBranchInfo().Count;
      } else {
        // Change destination of the conditional branch.
        MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
        Count = PredBB->getTakenBranchInfo().Count;
      }
      const uint64_t CTCTakenFreq =
          Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;

      // Annotate it, so "isCall" returns true for this jcc
      MIB->setConditionalTailCall(*CondBranch);
      // Add info about the conditional tail call frequency, otherwise this
      // info will be lost when we delete the associated BranchInfo entry
      auto &CTCAnnotation =
          MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
      CTCAnnotation = CTCTakenFreq;

      // Remove the unused successor which may be eliminated later
      // if there are no other users.
      PredBB->removeSuccessor(BB);
      // Update BB execution count: the executions diverted to the CTC no
      // longer reach BB.
      if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
        BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
      else if (CTCTakenFreq > BB->getKnownExecutionCount())
        BB->setExecutionCount(0);

      ++NumLocalCTCs;
      LocalCTCTakenCount += CTCTakenFreq;
      LocalCTCExecCount += PredBB->getKnownExecutionCount();
    }

    // Remove the block from CFG if all predecessors were removed.
    BB->markValid(isValid(BB));
  }

  // Add unconditional branches at the end of BBs to new successors
  // as long as the successor is not a fallthrough.
  for (auto &Entry : NeedsUncondBranch) {
    BinaryBasicBlock *PredBB = Entry.first;
    const BinaryBasicBlock *CondSucc = Entry.second;

    const MCSymbol *TBB = nullptr;
    const MCSymbol *FBB = nullptr;
    MCInst *CondBranch = nullptr;
    MCInst *UncondBranch = nullptr;
    PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);

    // Find the next valid block.  Invalid blocks will be deleted
    // so they shouldn't be considered fallthrough targets.
    const BinaryBasicBlock *NextBlock =
        BF.getLayout().getBasicBlockAfter(PredBB, false);
    while (NextBlock && !isValid(NextBlock))
      NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);

    // Get the unconditional successor to this block.
    const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
    assert(PredSucc && "The other branch should be a tail call");

    const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);

    if (UncondBranch) {
      // An unconditional branch already exists: drop it if the successor is
      // now the fallthrough, otherwise point it at the old taken target.
      if (HasFallthrough)
        PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
      else
        MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
    } else if (!HasFallthrough) {
      MCInst Branch;
      MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
      PredBB->addInstruction(Branch);
    }
  }

  if (NumLocalCTCs > 0) {
    NumDoubleJumps += fixDoubleJumps(BF, true);
    // Clean-up unreachable tail-call blocks.
    const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
    DeletedBlocks += Stats.first;
    DeletedBytes += Stats.second;

    assert(BF.validateCFG());
  }

  LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
                    << " conditional tail calls from a total of "
                    << NumLocalCTCCandidates << " candidates in function " << BF
                    << ". CTCs execution count for this function is "
                    << LocalCTCExecCount << " and CTC taken count is "
                    << LocalCTCTakenCount << "\n";);

  // Fold per-function counters into the pass-wide statistics.
  NumTailCallsPatched += NumLocalCTCs;
  NumCandidateTailCalls += NumLocalCTCCandidates;
  CTCExecCount += LocalCTCExecCount;
  CTCTakenCount += LocalCTCTakenCount;

  return NumLocalCTCs > 0;
}
994 
995 void SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
996   if (!BC.isX86())
997     return;
998 
999   for (auto &It : BC.getBinaryFunctions()) {
1000     BinaryFunction &Function = It.second;
1001 
1002     if (!shouldOptimize(Function))
1003       continue;
1004 
1005     if (fixTailCalls(Function)) {
1006       Modified.insert(&Function);
1007       Function.setHasCanonicalCFG(false);
1008     }
1009   }
1010 
1011   if (NumTailCallsPatched)
1012     outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1013            << " tail calls (" << NumOrigForwardBranches << " forward)"
1014            << " tail calls (" << NumOrigBackwardBranches << " backward)"
1015            << " from a total of " << NumCandidateTailCalls << " while removing "
1016            << NumDoubleJumps << " double jumps"
1017            << " and removing " << DeletedBlocks << " basic blocks"
1018            << " totalling " << DeletedBytes
1019            << " bytes of code. CTCs total execution count is " << CTCExecCount
1020            << " and the number of times CTCs are taken is " << CTCTakenCount
1021            << "\n";
1022 }
1023 
1024 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1025   uint64_t Count = 0;
1026   const BinaryContext &BC = Function.getBinaryContext();
1027   for (BinaryBasicBlock &BB : Function) {
1028     for (MCInst &Inst : BB) {
1029       MCInst OriginalInst;
1030       if (opts::Verbosity > 2)
1031         OriginalInst = Inst;
1032 
1033       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1034         continue;
1035 
1036       if (opts::Verbosity > 2) {
1037         BC.scopeLock();
1038         outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1039         BC.printInstruction(outs(), OriginalInst, 0, &Function);
1040         outs() << "BOLT-INFO: to:";
1041         BC.printInstruction(outs(), Inst, 0, &Function);
1042       }
1043 
1044       ++Count;
1045     }
1046   }
1047 
1048   return Count;
1049 }
1050 
1051 void ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1052   std::atomic<uint64_t> NumShortened{0};
1053   if (!BC.isX86())
1054     return;
1055 
1056   ParallelUtilities::runOnEachFunction(
1057       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1058       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1059       nullptr, "ShortenInstructions");
1060 
1061   if (NumShortened)
1062     outs() << "BOLT-INFO: " << NumShortened << " instructions were shortened\n";
1063 }
1064 
1065 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1066   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1067   for (BinaryBasicBlock &BB : Function) {
1068     MCInst *Inst = BB.getLastNonPseudoInstr();
1069     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1070       MCInst Trap;
1071       if (MIB->createTrap(Trap)) {
1072         BB.addInstruction(Trap);
1073         ++TailCallTraps;
1074       }
1075     }
1076   }
1077 }
1078 
1079 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1080   for (BinaryBasicBlock &BB : Function) {
1081     if (BB.succ_size() != 2)
1082       continue;
1083 
1084     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1085     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1086     if (CondBB != UncondBB)
1087       continue;
1088 
1089     const MCSymbol *TBB = nullptr;
1090     const MCSymbol *FBB = nullptr;
1091     MCInst *CondBranch = nullptr;
1092     MCInst *UncondBranch = nullptr;
1093     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1094 
1095     // analyzeBranch() can fail due to unusual branch instructions,
1096     // e.g. jrcxz, or jump tables (indirect jump).
1097     if (!Result || !CondBranch)
1098       continue;
1099 
1100     BB.removeDuplicateConditionalSuccessor(CondBranch);
1101     ++NumUselessCondBranches;
1102   }
1103 }
1104 
1105 void Peepholes::runOnFunctions(BinaryContext &BC) {
1106   const char Opts =
1107       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1108                       [](const char A, const PeepholeOpts B) { return A | B; });
1109   if (Opts == PEEP_NONE)
1110     return;
1111 
1112   for (auto &It : BC.getBinaryFunctions()) {
1113     BinaryFunction &Function = It.second;
1114     if (shouldOptimize(Function)) {
1115       if (Opts & PEEP_DOUBLE_JUMPS)
1116         NumDoubleJumps += fixDoubleJumps(Function, false);
1117       if (Opts & PEEP_TAILCALL_TRAPS)
1118         addTailcallTraps(Function);
1119       if (Opts & PEEP_USELESS_BRANCHES)
1120         removeUselessCondBranches(Function);
1121       assert(Function.validateCFG());
1122     }
1123   }
1124   outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1125          << " double jumps patched.\n"
1126          << "BOLT-INFO: Peephole: " << TailCallTraps
1127          << " tail call traps inserted.\n"
1128          << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1129          << " useless conditional branches removed.\n";
1130 }
1131 
// Replace loads from statically-known read-only addresses with immediate
// operands holding the loaded constant. Returns true iff at least one load
// in \p BF was simplified.
bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
  BinaryContext &BC = BF.getBinaryContext();
  MCPlusBuilder *MIB = BC.MIB.get();

  // Per-function counters; folded into pass-wide totals at the end.
  // "Dynamic" counters are weighted by block execution counts.
  uint64_t NumLocalLoadsSimplified = 0;
  uint64_t NumDynamicLocalLoadsSimplified = 0;
  uint64_t NumLocalLoadsFound = 0;
  uint64_t NumDynamicLocalLoadsFound = 0;

  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    for (MCInst &Inst : *BB) {
      unsigned Opcode = Inst.getOpcode();
      const MCInstrDesc &Desc = BC.MII->get(Opcode);

      // Skip instructions that do not load from memory.
      if (!Desc.mayLoad())
        continue;

      // Try to statically evaluate the target memory address;
      uint64_t TargetAddress;

      if (MIB->hasPCRelOperand(Inst)) {
        // Try to find the symbol that corresponds to the PC-relative operand.
        MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
        assert(DispOpI != Inst.end() && "expected PC-relative displacement");
        assert(DispOpI->isExpr() &&
               "found PC-relative with non-symbolic displacement");

        // Get displacement symbol.
        const MCSymbol *DisplSymbol;
        uint64_t DisplOffset;

        std::tie(DisplSymbol, DisplOffset) =
            MIB->getTargetSymbolInfo(DispOpI->getExpr());

        if (!DisplSymbol)
          continue;

        // Look up the symbol address in the global symbols map of the binary
        // context object.
        BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
        if (!BD)
          continue;
        TargetAddress = BD->getAddress() + DisplOffset;
      } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
        // Absolute address that could not be evaluated statically.
        continue;
      }

      // Get the contents of the section containing the target address of the
      // memory operand. We are only interested in read-only sections.
      ErrorOr<BinarySection &> DataSection =
          BC.getSectionForAddress(TargetAddress);
      if (!DataSection || DataSection->isWritable())
        continue;

      // A relocation against the address means the "constant" may change at
      // link/load time, so it cannot be folded into an immediate.
      if (BC.getRelocationAt(TargetAddress) ||
          BC.getDynamicRelocationAt(TargetAddress))
        continue;

      uint32_t Offset = TargetAddress - DataSection->getAddress();
      StringRef ConstantData = DataSection->getContents();

      ++NumLocalLoadsFound;
      if (BB->hasProfile())
        NumDynamicLocalLoadsFound += BB->getExecutionCount();

      // The rewrite itself may still fail (e.g. unsupported operand form),
      // so successes are counted separately from candidates found.
      if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
        ++NumLocalLoadsSimplified;
        if (BB->hasProfile())
          NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
      }
    }
  }

  NumLoadsFound += NumLocalLoadsFound;
  NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
  NumLoadsSimplified += NumLocalLoadsSimplified;
  NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;

  return NumLocalLoadsSimplified > 0;
}
1213 
1214 void SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1215   for (auto &It : BC.getBinaryFunctions()) {
1216     BinaryFunction &Function = It.second;
1217     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1218       Modified.insert(&Function);
1219   }
1220 
1221   outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1222          << NumLoadsFound << " loads from a statically computed address.\n"
1223          << "BOLT-INFO: dynamic loads simplified: " << NumDynamicLoadsSimplified
1224          << "\n"
1225          << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound << "\n";
1226 }
1227 
1228 void AssignSections::runOnFunctions(BinaryContext &BC) {
1229   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1230     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1231     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1232   }
1233 
1234   // In non-relocation mode functions have pre-assigned section names.
1235   if (!BC.HasRelocations)
1236     return;
1237 
1238   const bool UseColdSection =
1239       BC.NumProfiledFuncs > 0 ||
1240       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1241   for (auto &BFI : BC.getBinaryFunctions()) {
1242     BinaryFunction &Function = BFI.second;
1243     if (opts::isHotTextMover(Function)) {
1244       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1245       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1246       continue;
1247     }
1248 
1249     if (!UseColdSection || Function.hasValidIndex())
1250       Function.setCodeSectionName(BC.getMainCodeSectionName());
1251     else
1252       Function.setCodeSectionName(BC.getColdCodeSectionName());
1253 
1254     if (Function.isSplit())
1255       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1256   }
1257 }
1258 
// Report a "profile bias" score: for each basic block, compare the sum of
// incoming edge frequencies against the sum of outgoing edge frequencies.
// In a self-consistent profile the two should match; the mean and standard
// deviation of the relative imbalance are printed as percentages.
void PrintProfileStats::runOnFunctions(BinaryContext &BC) {
  double FlowImbalanceMean = 0.0;
  size_t NumBlocksConsidered = 0;
  // Track the function with the worst per-function average imbalance.
  double WorstBias = 0.0;
  const BinaryFunction *WorstBiasFunc = nullptr;

  // For each function CFG, we fill an IncomingMap with the sum of the frequency
  // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
  // of the frequency of outgoing edges.
  using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
  std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;

  // Compute mean
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    // First pass over the CFG: accumulate per-block in/out edge frequencies.
    for (const BinaryBasicBlock &BB : Function) {
      uint64_t TotalOutgoing = 0ULL;
      auto SuccBIIter = BB.branch_info_begin();
      for (BinaryBasicBlock *Succ : BB.successors()) {
        uint64_t Count = SuccBIIter->Count;
        // Edges without profile data (or never taken) contribute nothing.
        if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
          ++SuccBIIter;
          continue;
        }
        TotalOutgoing += Count;
        IncomingMap[Succ] += Count;
        ++SuccBIIter;
      }
      OutgoingMap[&BB] = TotalOutgoing;
    }

    size_t NumBlocks = 0;
    double Mean = 0.0;
    for (const BinaryBasicBlock &BB : Function) {
      // Do not compute score for low frequency blocks, entry or exit blocks
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
        continue;
      ++NumBlocks;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      Mean += fabs(Difference / IncomingMap[&BB]);
    }

    FlowImbalanceMean += Mean;
    NumBlocksConsidered += NumBlocks;
    if (!NumBlocks)
      continue;
    double FuncMean = Mean / NumBlocks;
    if (FuncMean > WorstBias) {
      WorstBias = FuncMean;
      WorstBiasFunc = &Function;
    }
  }
  if (NumBlocksConsidered > 0)
    FlowImbalanceMean /= NumBlocksConsidered;

  // Compute standard deviation
  NumBlocksConsidered = 0;
  double FlowImbalanceVar = 0.0;
  for (const auto &BFI : BC.getBinaryFunctions()) {
    const BinaryFunction &Function = BFI.second;
    if (Function.empty() || !Function.isSimple())
      continue;
    FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
    FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
    for (const BinaryBasicBlock &BB : Function) {
      // NOTE(review): unlike the mean computation above, this filter does not
      // exclude entry points (no BB.isEntryPoint() check) — confirm whether
      // the asymmetry between the two passes is intentional.
      if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
        continue;
      ++NumBlocksConsidered;
      const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
      FlowImbalanceVar +=
          pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
    }
  }
  if (NumBlocksConsidered) {
    FlowImbalanceVar /= NumBlocksConsidered;
    FlowImbalanceVar = sqrt(FlowImbalanceVar);
  }

  // Report to user
  outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
                   (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
  if (WorstBiasFunc && opts::Verbosity >= 1) {
    outs() << "Worst average bias observed in " << WorstBiasFunc->getPrintName()
           << "\n";
    LLVM_DEBUG(WorstBiasFunc->dump());
  }
}
1351 
1352 void PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1353   uint64_t NumRegularFunctions = 0;
1354   uint64_t NumStaleProfileFunctions = 0;
1355   uint64_t NumAllStaleFunctions = 0;
1356   uint64_t NumInferredFunctions = 0;
1357   uint64_t NumNonSimpleProfiledFunctions = 0;
1358   uint64_t NumUnknownControlFlowFunctions = 0;
1359   uint64_t TotalSampleCount = 0;
1360   uint64_t StaleSampleCount = 0;
1361   uint64_t InferredSampleCount = 0;
1362   std::vector<const BinaryFunction *> ProfiledFunctions;
1363   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1364   for (auto &BFI : BC.getBinaryFunctions()) {
1365     const BinaryFunction &Function = BFI.second;
1366 
1367     // Ignore PLT functions for stats.
1368     if (Function.isPLTFunction())
1369       continue;
1370 
1371     ++NumRegularFunctions;
1372 
1373     if (!Function.isSimple()) {
1374       if (Function.hasProfile())
1375         ++NumNonSimpleProfiledFunctions;
1376       continue;
1377     }
1378 
1379     if (Function.hasUnknownControlFlow()) {
1380       if (opts::PrintUnknownCFG)
1381         Function.dump();
1382       else if (opts::PrintUnknown)
1383         errs() << "function with unknown control flow: " << Function << '\n';
1384 
1385       ++NumUnknownControlFlowFunctions;
1386     }
1387 
1388     if (!Function.hasProfile())
1389       continue;
1390 
1391     uint64_t SampleCount = Function.getRawBranchCount();
1392     TotalSampleCount += SampleCount;
1393 
1394     if (Function.hasValidProfile()) {
1395       ProfiledFunctions.push_back(&Function);
1396       if (Function.hasInferredProfile()) {
1397         ++NumInferredFunctions;
1398         InferredSampleCount += SampleCount;
1399         ++NumAllStaleFunctions;
1400       }
1401     } else {
1402       if (opts::ReportStaleFuncs) {
1403         outs() << StaleFuncsHeader;
1404         StaleFuncsHeader = "";
1405         outs() << "  " << Function << '\n';
1406       }
1407       ++NumStaleProfileFunctions;
1408       StaleSampleCount += SampleCount;
1409       ++NumAllStaleFunctions;
1410     }
1411   }
1412   BC.NumProfiledFuncs = ProfiledFunctions.size();
1413   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1414 
1415   const size_t NumAllProfiledFunctions =
1416       ProfiledFunctions.size() + NumStaleProfileFunctions;
1417   outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1418          << NumRegularFunctions << " functions in the binary ("
1419          << format("%.1f", NumAllProfiledFunctions /
1420                                (float)NumRegularFunctions * 100.0f)
1421          << "%) have non-empty execution profile\n";
1422   if (NumNonSimpleProfiledFunctions) {
1423     outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1424            << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1425            << " with profile could not be optimized\n";
1426   }
1427   if (NumStaleProfileFunctions) {
1428     const float PctStale =
1429         NumStaleProfileFunctions / (float)NumAllProfiledFunctions * 100.0f;
1430     auto printErrorOrWarning = [&]() {
1431       if (PctStale > opts::StaleThreshold)
1432         errs() << "BOLT-ERROR: ";
1433       else
1434         errs() << "BOLT-WARNING: ";
1435     };
1436     printErrorOrWarning();
1437     errs() << NumStaleProfileFunctions
1438            << format(" (%.1f%% of all profiled)", PctStale) << " function"
1439            << (NumStaleProfileFunctions == 1 ? "" : "s")
1440            << " have invalid (possibly stale) profile."
1441               " Use -report-stale to see the list.\n";
1442     if (TotalSampleCount > 0) {
1443       printErrorOrWarning();
1444       errs() << StaleSampleCount << " out of " << TotalSampleCount
1445              << " samples in the binary ("
1446              << format("%.1f", ((100.0f * StaleSampleCount) / TotalSampleCount))
1447              << "%) belong to functions with invalid"
1448                 " (possibly stale) profile.\n";
1449     }
1450     if (PctStale > opts::StaleThreshold) {
1451       errs() << "BOLT-ERROR: stale functions exceed specified threshold of "
1452              << opts::StaleThreshold << "%. Exiting.\n";
1453       exit(1);
1454     }
1455   }
1456   if (NumInferredFunctions) {
1457     outs() << format("BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
1458                      "%.2f%% of stale) functions responsible for %.2f%% samples"
1459                      " (%zu out of %zu)\n",
1460                      NumInferredFunctions,
1461                      100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1462                      100.0 * NumInferredFunctions / NumAllStaleFunctions,
1463                      100.0 * InferredSampleCount / TotalSampleCount,
1464                      InferredSampleCount, TotalSampleCount);
1465     outs() << format(
1466         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1467         " (%zu out of %zu stale) responsible for %.2f%% samples"
1468         " (%zu out of %zu stale)\n",
1469         100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1470         BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1471         100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1472         BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1473   }
1474 
1475   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1476     outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1477            << " objects was ignored\n";
1478   }
1479 
1480   if (ProfiledFunctions.size() > 10) {
1481     if (opts::Verbosity >= 1) {
1482       outs() << "BOLT-INFO: top called functions are:\n";
1483       llvm::sort(ProfiledFunctions,
1484                  [](const BinaryFunction *A, const BinaryFunction *B) {
1485                    return B->getExecutionCount() < A->getExecutionCount();
1486                  });
1487       auto SFI = ProfiledFunctions.begin();
1488       auto SFIend = ProfiledFunctions.end();
1489       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1490            ++SFI, ++I)
1491         outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount() << '\n';
1492     }
1493   }
1494 
1495   if (!opts::PrintSortedBy.empty()) {
1496     std::vector<BinaryFunction *> Functions;
1497     std::map<const BinaryFunction *, DynoStats> Stats;
1498 
1499     for (auto &BFI : BC.getBinaryFunctions()) {
1500       BinaryFunction &BF = BFI.second;
1501       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1502         Functions.push_back(&BF);
1503         Stats.emplace(&BF, getDynoStats(BF));
1504       }
1505     }
1506 
1507     const bool SortAll =
1508         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1509 
1510     const bool Ascending =
1511         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1512 
1513     if (SortAll) {
1514       llvm::stable_sort(Functions,
1515                         [Ascending, &Stats](const BinaryFunction *A,
1516                                             const BinaryFunction *B) {
1517                           return Ascending ? Stats.at(A) < Stats.at(B)
1518                                            : Stats.at(B) < Stats.at(A);
1519                         });
1520     } else {
1521       llvm::stable_sort(
1522           Functions, [Ascending, &Stats](const BinaryFunction *A,
1523                                          const BinaryFunction *B) {
1524             const DynoStats &StatsA = Stats.at(A);
1525             const DynoStats &StatsB = Stats.at(B);
1526             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1527                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1528           });
1529     }
1530 
1531     outs() << "BOLT-INFO: top functions sorted by ";
1532     if (SortAll) {
1533       outs() << "dyno stats";
1534     } else {
1535       outs() << "(";
1536       bool PrintComma = false;
1537       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1538         if (PrintComma)
1539           outs() << ", ";
1540         outs() << DynoStats::Description(Category);
1541         PrintComma = true;
1542       }
1543       outs() << ")";
1544     }
1545 
1546     outs() << " are:\n";
1547     auto SFI = Functions.begin();
1548     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1549       const DynoStats Stats = getDynoStats(**SFI);
1550       outs() << "  " << **SFI;
1551       if (!SortAll) {
1552         outs() << " (";
1553         bool PrintComma = false;
1554         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1555           if (PrintComma)
1556             outs() << ", ";
1557           outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1558           PrintComma = true;
1559         }
1560         outs() << ")";
1561       }
1562       outs() << "\n";
1563     }
1564   }
1565 
1566   if (!BC.TrappedFunctions.empty()) {
1567     errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1568            << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1569            << " will trap on entry. Use -trap-avx512=0 to disable"
1570               " traps.";
1571     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1572       errs() << '\n';
1573       for (const BinaryFunction *Function : BC.TrappedFunctions)
1574         errs() << "  " << *Function << '\n';
1575     } else {
1576       errs() << " Use -v=1 to see the list.\n";
1577     }
1578   }
1579 
1580   // Print information on missed macro-fusion opportunities seen on input.
1581   if (BC.Stats.MissedMacroFusionPairs) {
1582     outs() << format("BOLT-INFO: the input contains %zu (dynamic count : %zu)"
1583                      " opportunities for macro-fusion optimization",
1584                      BC.Stats.MissedMacroFusionPairs,
1585                      BC.Stats.MissedMacroFusionExecCount);
1586     switch (opts::AlignMacroOpFusion) {
1587     case MFT_NONE:
1588       outs() << ". Use -align-macro-fusion to fix.\n";
1589       break;
1590     case MFT_HOT:
1591       outs() << ". Will fix instances on a hot path.\n";
1592       break;
1593     case MFT_ALL:
1594       outs() << " that are going to be fixed\n";
1595       break;
1596     }
1597   }
1598 
1599   // Collect and print information about suboptimal code layout on input.
1600   if (opts::ReportBadLayout) {
1601     std::vector<BinaryFunction *> SuboptimalFuncs;
1602     for (auto &BFI : BC.getBinaryFunctions()) {
1603       BinaryFunction &BF = BFI.second;
1604       if (!BF.hasValidProfile())
1605         continue;
1606 
1607       const uint64_t HotThreshold =
1608           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1609       bool HotSeen = false;
1610       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1611         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1612           HotSeen = true;
1613           continue;
1614         }
1615         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1616           SuboptimalFuncs.push_back(&BF);
1617           break;
1618         }
1619       }
1620     }
1621 
1622     if (!SuboptimalFuncs.empty()) {
1623       llvm::sort(SuboptimalFuncs,
1624                  [](const BinaryFunction *A, const BinaryFunction *B) {
1625                    return A->getKnownExecutionCount() / A->getSize() >
1626                           B->getKnownExecutionCount() / B->getSize();
1627                  });
1628 
1629       outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1630              << " functions have "
1631                 "cold code in the middle of hot code. Top functions are:\n";
1632       for (unsigned I = 0;
1633            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1634                         SuboptimalFuncs.size());
1635            ++I)
1636         SuboptimalFuncs[I]->print(outs());
1637     }
1638   }
1639 
1640   if (NumUnknownControlFlowFunctions) {
1641     outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1642            << " functions have instructions with unknown control flow";
1643     if (!opts::PrintUnknown)
1644       outs() << ". Use -print-unknown to see the list.";
1645     outs() << '\n';
1646   }
1647 }
1648 
1649 void InstructionLowering::runOnFunctions(BinaryContext &BC) {
1650   for (auto &BFI : BC.getBinaryFunctions())
1651     for (BinaryBasicBlock &BB : BFI.second)
1652       for (MCInst &Instruction : BB)
1653         BC.MIB->lowerTailCall(Instruction);
1654 }
1655 
1656 void StripRepRet::runOnFunctions(BinaryContext &BC) {
1657   if (!BC.isX86())
1658     return;
1659 
1660   uint64_t NumPrefixesRemoved = 0;
1661   uint64_t NumBytesSaved = 0;
1662   for (auto &BFI : BC.getBinaryFunctions()) {
1663     for (BinaryBasicBlock &BB : BFI.second) {
1664       auto LastInstRIter = BB.getLastNonPseudo();
1665       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1666           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1667         continue;
1668 
1669       NumPrefixesRemoved += BB.getKnownExecutionCount();
1670       ++NumBytesSaved;
1671     }
1672   }
1673 
1674   if (NumBytesSaved)
1675     outs() << "BOLT-INFO: removed " << NumBytesSaved
1676            << " 'repz' prefixes"
1677               " with estimated execution count of "
1678            << NumPrefixesRemoved << " times.\n";
1679 }
1680 
void InlineMemcpy::runOnFunctions(BinaryContext &BC) {
  // Replace calls to memcpy() (and the _memcpy8 variant) with an inline
  // instruction sequence. X86-only: the replacement code comes from the
  // x86 MCPlusBuilder.
  if (!BC.isX86())
    return;

  uint64_t NumInlined = 0;     // static count of inlined call sites
  uint64_t NumInlinedDyno = 0; // profiled execution count of those sites
  for (auto &BFI : BC.getBinaryFunctions()) {
    for (BinaryBasicBlock &BB : BFI.second) {
      for (auto II = BB.begin(); II != BB.end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target operand are
        // candidates.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT" &&
            CalleeSymbol->getName() != "_memcpy8")
          continue;

        // _memcpy8 gets a distinct expansion (flag forwarded to
        // createInlineMemcpy).
        const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
        const bool IsTailCall = BC.MIB->isTailCall(Inst);

        const InstructionListType NewCode =
            BC.MIB->createInlineMemcpy(IsMemcpy8);
        II = BB.replaceInstruction(II, NewCode);
        // Skip over the inserted sequence so the loop's ++II resumes at the
        // next original instruction.
        std::advance(II, NewCode.size() - 1);
        if (IsTailCall) {
          // The tail call used to transfer control out of the function;
          // after inlining we must return explicitly.
          MCInst Return;
          BC.MIB->createReturn(Return);
          II = BB.insertInstruction(std::next(II), std::move(Return));
        }

        ++NumInlined;
        NumInlinedDyno += BB.getKnownExecutionCount();
      }
    }
  }

  if (NumInlined) {
    outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
    if (NumInlinedDyno)
      outs() << ". The calls were executed " << NumInlinedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
1729 
1730 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1731   if (!BinaryFunctionPass::shouldOptimize(Function))
1732     return false;
1733 
1734   for (const std::string &FunctionSpec : Spec) {
1735     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1736     if (Function.hasNameRegex(FunctionName))
1737       return true;
1738   }
1739 
1740   return false;
1741 }
1742 
1743 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1744     const BinaryFunction &Function) const {
1745   StringRef SitesString;
1746   for (const std::string &FunctionSpec : Spec) {
1747     StringRef FunctionName;
1748     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1749     if (Function.hasNameRegex(FunctionName))
1750       break;
1751     SitesString = "";
1752   }
1753 
1754   std::set<size_t> Sites;
1755   SmallVector<StringRef, 4> SitesVec;
1756   SitesString.split(SitesVec, ':');
1757   for (StringRef SiteString : SitesVec) {
1758     if (SiteString.empty())
1759       continue;
1760     size_t Result;
1761     if (!SiteString.getAsInteger(10, Result))
1762       Sites.emplace(Result);
1763   }
1764 
1765   return Sites;
1766 }
1767 
void SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
  // Rewrite selected memcpy() call sites into a size check: when the size
  // argument equals 1, run an inline one-byte copy; otherwise fall through
  // to the original call. X86-only.
  if (!BC.isX86())
    return;

  uint64_t NumSpecialized = 0;     // static count of rewritten call sites
  uint64_t NumSpecializedDyno = 0; // profiled execution count of those sites
  for (auto &BFI : BC.getBinaryFunctions()) {
    BinaryFunction &Function = BFI.second;
    if (!shouldOptimize(Function))
      continue;

    // An empty site set means "optimize every memcpy call in the function".
    std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
    auto shouldOptimize = [&](size_t N) {
      return CallsToOptimize.empty() || CallsToOptimize.count(N);
    };

    // Snapshot the block list: the loop below adds new blocks to Function
    // while iterating.
    std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
    size_t CallSiteID = 0;
    for (BinaryBasicBlock *CurBB : Blocks) {
      for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
        MCInst &Inst = *II;

        // Only direct calls with a single symbolic target operand are
        // candidates.
        if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
            !Inst.getOperand(0).isExpr())
          continue;

        const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
        if (CalleeSymbol->getName() != "memcpy" &&
            CalleeSymbol->getName() != "memcpy@PLT")
          continue;

        // A tail call has no fall-through point to resume at; skip it.
        if (BC.MIB->isTailCall(Inst))
          continue;

        // Call site IDs are 1-based and count non-tail memcpy calls only
        // (must stay in sync with the numbering users give in the spec).
        ++CallSiteID;

        if (!shouldOptimize(CallSiteID))
          continue;

        // Create a copy of a call to memcpy(dest, src, size).
        MCInst MemcpyInstr = Inst;

        // Split at the call: OneByteMemcpyBB starts with the call itself,
        // CurBB keeps the instructions before it.
        BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);

        BinaryBasicBlock *NextBB = nullptr;
        if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
          // Instructions follow the call: split them into NextBB, then
          // delete the call (NextBB's first instruction after the split).
          NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
          NextBB->eraseInstruction(NextBB->begin());
        } else {
          // The call was the block's last instruction: resume at the lone
          // successor and delete the call from OneByteMemcpyBB.
          NextBB = OneByteMemcpyBB->getSuccessor();
          OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
          assert(NextBB && "unexpected call to memcpy() with no return");
        }

        // MemcpyBB carries the original (fallback) call for sizes != 1.
        BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
        MemcpyBB->setOffset(CurBB->getInputOffset());
        // Compare the third integer argument (the size) against 1 and jump
        // to the one-byte fast path on equality.
        InstructionListType CmpJCC =
            BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
                                OneByteMemcpyBB->getLabel(), BC.Ctx.get());
        CurBB->addInstructions(CmpJCC);
        CurBB->addSuccessor(MemcpyBB);

        MemcpyBB->addInstruction(std::move(MemcpyInstr));
        MemcpyBB->addSuccessor(NextBB);
        MemcpyBB->setCFIState(NextBB->getCFIState());
        MemcpyBB->setExecutionCount(0);

        // To prevent the actual call from being moved to cold, we set its
        // execution count to 1.
        if (CurBB->getKnownExecutionCount() > 0)
          MemcpyBB->setExecutionCount(1);

        // Fill the fast path with the inline one-byte copy sequence.
        InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
        OneByteMemcpyBB->addInstructions(OneByteMemcpy);

        ++NumSpecialized;
        NumSpecializedDyno += CurBB->getKnownExecutionCount();

        // Continue scanning from the resume block.
        CurBB = NextBB;

        // Note: we don't expect the next instruction to be a call to memcpy.
        II = CurBB->begin();
      }
    }
  }

  if (NumSpecialized) {
    outs() << "BOLT-INFO: specialized " << NumSpecialized
           << " memcpy() call sites for size 1";
    if (NumSpecializedDyno)
      outs() << ". The calls were executed " << NumSpecializedDyno
             << " times based on profile.";
    outs() << '\n';
  }
}
1863 
1864 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1865   const BinaryContext &BC = BF.getBinaryContext();
1866   for (BinaryBasicBlock &BB : BF) {
1867     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1868       MCInst &Inst = BB.getInstructionAtIndex(I);
1869       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1870         BB.eraseInstructionAtIndex(I);
1871     }
1872   }
1873 }
1874 
1875 void RemoveNops::runOnFunctions(BinaryContext &BC) {
1876   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1877     runOnFunction(BF);
1878   };
1879 
1880   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1881     return BF.shouldPreserveNops();
1882   };
1883 
1884   ParallelUtilities::runOnEachFunction(
1885       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1886       SkipFunc, "RemoveNops");
1887 }
1888 
1889 } // namespace bolt
1890 } // namespace llvm
1891