xref: /llvm-project/bolt/lib/Passes/BinaryPasses.cpp (revision 54b17fa4eefb7ccb00fba8189e92fac2d46cc481)
1 //===- bolt/Passes/BinaryPasses.cpp - Binary-level passes -----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements multiple passes for binary optimization and analysis.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/BinaryPasses.h"
14 #include "bolt/Core/FunctionLayout.h"
15 #include "bolt/Core/ParallelUtilities.h"
16 #include "bolt/Passes/ReorderAlgorithm.h"
17 #include "bolt/Passes/ReorderFunctions.h"
18 #include "llvm/Support/CommandLine.h"
19 #include <atomic>
20 #include <mutex>
21 #include <numeric>
22 #include <vector>
23 
24 #define DEBUG_TYPE "bolt-opts"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
29 static const char *dynoStatsOptName(const bolt::DynoStats::Category C) {
30   assert(C > bolt::DynoStats::FIRST_DYNO_STAT &&
31          C < DynoStats::LAST_DYNO_STAT && "Unexpected dyno stat category.");
32 
33   static std::string OptNames[bolt::DynoStats::LAST_DYNO_STAT + 1];
34 
35   OptNames[C] = bolt::DynoStats::Description(C);
36 
37   std::replace(OptNames[C].begin(), OptNames[C].end(), ' ', '-');
38 
39   return OptNames[C].c_str();
40 }
41 
42 namespace opts {
43 
44 extern cl::OptionCategory BoltCategory;
45 extern cl::OptionCategory BoltOptCategory;
46 
47 extern cl::opt<bolt::MacroFusionType> AlignMacroOpFusion;
48 extern cl::opt<unsigned> Verbosity;
49 extern cl::opt<bool> EnableBAT;
50 extern cl::opt<unsigned> ExecutionCountThreshold;
51 extern cl::opt<bool> UpdateDebugSections;
52 extern cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions;
53 
54 enum DynoStatsSortOrder : char {
55   Ascending,
56   Descending
57 };
58 
59 static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
60     "print-sorted-by-order",
61     cl::desc("use ascending or descending order when printing functions "
62              "ordered by dyno stats"),
63     cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
64 
65 cl::list<std::string>
66 HotTextMoveSections("hot-text-move-sections",
67   cl::desc("list of sections containing functions used for hugifying hot text. "
68            "BOLT makes sure these functions are not placed on the same page as "
69            "the hot text. (default=\'.stub,.mover\')."),
70   cl::value_desc("sec1,sec2,sec3,..."),
71   cl::CommaSeparated,
72   cl::ZeroOrMore,
73   cl::cat(BoltCategory));
74 
75 bool isHotTextMover(const BinaryFunction &Function) {
76   for (std::string &SectionName : opts::HotTextMoveSections) {
77     if (Function.getOriginSectionName() &&
78         *Function.getOriginSectionName() == SectionName)
79       return true;
80   }
81 
82   return false;
83 }
84 
85 static cl::opt<bool> MinBranchClusters(
86     "min-branch-clusters",
87     cl::desc("use a modified clustering algorithm geared towards minimizing "
88              "branches"),
89     cl::Hidden, cl::cat(BoltOptCategory));
90 
91 static cl::list<Peepholes::PeepholeOpts> Peepholes(
92     "peepholes", cl::CommaSeparated, cl::desc("enable peephole optimizations"),
93     cl::value_desc("opt1,opt2,opt3,..."),
94     cl::values(clEnumValN(Peepholes::PEEP_NONE, "none", "disable peepholes"),
95                clEnumValN(Peepholes::PEEP_DOUBLE_JUMPS, "double-jumps",
96                           "remove double jumps when able"),
97                clEnumValN(Peepholes::PEEP_TAILCALL_TRAPS, "tailcall-traps",
98                           "insert tail call traps"),
99                clEnumValN(Peepholes::PEEP_USELESS_BRANCHES, "useless-branches",
100                           "remove useless conditional branches"),
101                clEnumValN(Peepholes::PEEP_ALL, "all",
102                           "enable all peephole optimizations")),
103     cl::ZeroOrMore, cl::cat(BoltOptCategory));
104 
105 static cl::opt<unsigned>
106     PrintFuncStat("print-function-statistics",
107                   cl::desc("print statistics about basic block ordering"),
108                   cl::init(0), cl::cat(BoltOptCategory));
109 
110 static cl::opt<bool> PrintLargeFunctions(
111     "print-large-functions",
112     cl::desc("print functions that could not be overwritten due to excessive "
113              "size"),
114     cl::init(false), cl::cat(BoltOptCategory));
115 
116 static cl::list<bolt::DynoStats::Category>
117     PrintSortedBy("print-sorted-by", cl::CommaSeparated,
118                   cl::desc("print functions sorted by order of dyno stats"),
119                   cl::value_desc("key1,key2,key3,..."),
120                   cl::values(
121 #define D(name, description, ...)                                              \
122   clEnumValN(bolt::DynoStats::name, dynoStatsOptName(bolt::DynoStats::name),   \
123              description),
124                       REAL_DYNO_STATS
125 #undef D
126                           clEnumValN(bolt::DynoStats::LAST_DYNO_STAT, "all",
127                                      "sorted by all names")),
128                   cl::ZeroOrMore, cl::cat(BoltOptCategory));
129 
130 static cl::opt<bool>
131     PrintUnknown("print-unknown",
132                  cl::desc("print names of functions with unknown control flow"),
133                  cl::cat(BoltCategory), cl::Hidden);
134 
135 static cl::opt<bool>
136     PrintUnknownCFG("print-unknown-cfg",
137                     cl::desc("dump CFG of functions with unknown control flow"),
138                     cl::cat(BoltCategory), cl::ReallyHidden);
139 
140 // Please MSVC19 with a forward declaration: otherwise it reports an error about
141 // an undeclared variable inside a callback.
142 extern cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks;
143 cl::opt<bolt::ReorderBasicBlocks::LayoutType> ReorderBlocks(
144     "reorder-blocks", cl::desc("change layout of basic blocks in a function"),
145     cl::init(bolt::ReorderBasicBlocks::LT_NONE),
146     cl::values(
147         clEnumValN(bolt::ReorderBasicBlocks::LT_NONE, "none",
148                    "do not reorder basic blocks"),
149         clEnumValN(bolt::ReorderBasicBlocks::LT_REVERSE, "reverse",
150                    "layout blocks in reverse order"),
151         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE, "normal",
152                    "perform optimal layout based on profile"),
153         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_BRANCH,
154                    "branch-predictor",
155                    "perform optimal layout prioritizing branch "
156                    "predictions"),
157         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE, "cache",
158                    "perform optimal layout prioritizing I-cache "
159                    "behavior"),
160         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS, "cache+",
161                    "perform layout optimizing I-cache behavior"),
162         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP, "ext-tsp",
163                    "perform layout optimizing I-cache behavior"),
164         clEnumValN(bolt::ReorderBasicBlocks::LT_OPTIMIZE_SHUFFLE,
165                    "cluster-shuffle", "perform random layout of clusters")),
166     cl::ZeroOrMore, cl::cat(BoltOptCategory),
167     cl::callback([](const bolt::ReorderBasicBlocks::LayoutType &option) {
168       if (option == bolt::ReorderBasicBlocks::LT_OPTIMIZE_CACHE_PLUS) {
169         errs() << "BOLT-WARNING: '-reorder-blocks=cache+' is deprecated, please"
170                << " use '-reorder-blocks=ext-tsp' instead\n";
171         ReorderBlocks = bolt::ReorderBasicBlocks::LT_OPTIMIZE_EXT_TSP;
172       }
173     }));
174 
175 static cl::opt<unsigned> ReportBadLayout(
176     "report-bad-layout",
177     cl::desc("print top <uint> functions with suboptimal code layout on input"),
178     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
179 
180 static cl::opt<bool>
181     ReportStaleFuncs("report-stale",
182                      cl::desc("print the list of functions with stale profile"),
183                      cl::Hidden, cl::cat(BoltOptCategory));
184 
185 enum SctcModes : char {
186   SctcAlways,
187   SctcPreserveDirection,
188   SctcHeuristic
189 };
190 
191 static cl::opt<SctcModes>
192 SctcMode("sctc-mode",
193   cl::desc("mode for simplify conditional tail calls"),
194   cl::init(SctcAlways),
195   cl::values(clEnumValN(SctcAlways, "always", "always perform sctc"),
196     clEnumValN(SctcPreserveDirection,
197       "preserve",
198       "only perform sctc when branch direction is "
199       "preserved"),
200     clEnumValN(SctcHeuristic,
201       "heuristic",
202       "use branch prediction data to control sctc")),
203   cl::ZeroOrMore,
204   cl::cat(BoltOptCategory));
205 
206 static cl::opt<unsigned>
207 StaleThreshold("stale-threshold",
208     cl::desc(
209       "maximum percentage of stale functions to tolerate (default: 100)"),
210     cl::init(100),
211     cl::Hidden,
212     cl::cat(BoltOptCategory));
213 
214 static cl::opt<unsigned> TSPThreshold(
215     "tsp-threshold",
216     cl::desc(
217         "maximum number of hot basic blocks in a function for which to use "
218         "a precise TSP solution while re-ordering basic blocks"),
219     cl::init(10), cl::Hidden, cl::cat(BoltOptCategory));
220 
221 static cl::opt<unsigned> TopCalledLimit(
222     "top-called-limit",
223     cl::desc("maximum number of functions to print in top called "
224              "functions section"),
225     cl::init(100), cl::Hidden, cl::cat(BoltCategory));
226 
227 } // namespace opts
228 
229 namespace llvm {
230 namespace bolt {
231 
232 bool BinaryFunctionPass::shouldOptimize(const BinaryFunction &BF) const {
233   return BF.isSimple() && BF.getState() == BinaryFunction::State::CFG &&
234          !BF.isIgnored();
235 }
236 
237 bool BinaryFunctionPass::shouldPrint(const BinaryFunction &BF) const {
238   return BF.isSimple() && !BF.isIgnored();
239 }
240 
241 void NormalizeCFG::runOnFunction(BinaryFunction &BF) {
242   uint64_t NumRemoved = 0;
243   uint64_t NumDuplicateEdges = 0;
244   uint64_t NeedsFixBranches = 0;
245   for (BinaryBasicBlock &BB : BF) {
246     if (!BB.empty())
247       continue;
248 
249     if (BB.isEntryPoint() || BB.isLandingPad())
250       continue;
251 
252     // Handle a dangling empty block.
253     if (BB.succ_size() == 0) {
254       // If an empty dangling basic block has a predecessor, it could be a
255       // result of codegen for __builtin_unreachable. In such case, do not
256       // remove the block.
257       if (BB.pred_size() == 0) {
258         BB.markValid(false);
259         ++NumRemoved;
260       }
261       continue;
262     }
263 
264     // The block should have just one successor.
265     BinaryBasicBlock *Successor = BB.getSuccessor();
266     assert(Successor && "invalid CFG encountered");
267 
268     // Redirect all predecessors to the successor block.
269     while (!BB.pred_empty()) {
270       BinaryBasicBlock *Predecessor = *BB.pred_begin();
271       if (Predecessor->hasJumpTable())
272         break;
273 
274       if (Predecessor == Successor)
275         break;
276 
277       BinaryBasicBlock::BinaryBranchInfo &BI = Predecessor->getBranchInfo(BB);
278       Predecessor->replaceSuccessor(&BB, Successor, BI.Count,
279                                     BI.MispredictedCount);
280       // We need to fix branches even if we failed to replace all successors
281       // and remove the block.
282       NeedsFixBranches = true;
283     }
284 
285     if (BB.pred_empty()) {
286       BB.removeAllSuccessors();
287       BB.markValid(false);
288       ++NumRemoved;
289     }
290   }
291 
292   if (NumRemoved)
293     BF.eraseInvalidBBs();
294 
295   // Check for duplicate successors. Do it after the empty block elimination as
296   // we can get more duplicate successors.
297   for (BinaryBasicBlock &BB : BF)
298     if (!BB.hasJumpTable() && BB.succ_size() == 2 &&
299         BB.getConditionalSuccessor(false) == BB.getConditionalSuccessor(true))
300       ++NumDuplicateEdges;
301 
302   // fixBranches() will get rid of duplicate edges and update jump instructions.
303   if (NumDuplicateEdges || NeedsFixBranches)
304     BF.fixBranches();
305 
306   NumDuplicateEdgesMerged += NumDuplicateEdges;
307   NumBlocksRemoved += NumRemoved;
308 }
309 
310 Error NormalizeCFG::runOnFunctions(BinaryContext &BC) {
311   ParallelUtilities::runOnEachFunction(
312       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
313       [&](BinaryFunction &BF) { runOnFunction(BF); },
314       [&](const BinaryFunction &BF) { return !shouldOptimize(BF); },
315       "NormalizeCFG");
316   if (NumBlocksRemoved)
317     BC.outs() << "BOLT-INFO: removed " << NumBlocksRemoved << " empty block"
318               << (NumBlocksRemoved == 1 ? "" : "s") << '\n';
319   if (NumDuplicateEdgesMerged)
320     BC.outs() << "BOLT-INFO: merged " << NumDuplicateEdgesMerged
321               << " duplicate CFG edge"
322               << (NumDuplicateEdgesMerged == 1 ? "" : "s") << '\n';
323   return Error::success();
324 }
325 
326 void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) {
327   BinaryContext &BC = Function.getBinaryContext();
328   unsigned Count;
329   uint64_t Bytes;
330   Function.markUnreachableBlocks();
331   LLVM_DEBUG({
332     for (BinaryBasicBlock &BB : Function) {
333       if (!BB.isValid()) {
334         dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName()
335                << " in function " << Function << "\n";
336         Function.dump();
337       }
338     }
339   });
340   BinaryContext::IndependentCodeEmitter Emitter =
341       BC.createIndependentMCCodeEmitter();
342   std::tie(Count, Bytes) = Function.eraseInvalidBBs(Emitter.MCE.get());
343   DeletedBlocks += Count;
344   DeletedBytes += Bytes;
345   if (Count) {
346     auto L = BC.scopeLock();
347     Modified.insert(&Function);
348     if (opts::Verbosity > 0)
349       BC.outs() << "BOLT-INFO: removed " << Count
350                 << " dead basic block(s) accounting for " << Bytes
351                 << " bytes in function " << Function << '\n';
352   }
353 }
354 
355 Error EliminateUnreachableBlocks::runOnFunctions(BinaryContext &BC) {
356   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
357     runOnFunction(BF);
358   };
359 
360   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
361     return !shouldOptimize(BF) || BF.getLayout().block_empty();
362   };
363 
364   ParallelUtilities::runOnEachFunction(
365       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
366       SkipPredicate, "elimininate-unreachable");
367 
368   if (DeletedBlocks)
369     BC.outs() << "BOLT-INFO: UCE removed " << DeletedBlocks << " blocks and "
370               << DeletedBytes << " bytes of code\n";
371   return Error::success();
372 }
373 
374 bool ReorderBasicBlocks::shouldPrint(const BinaryFunction &BF) const {
375   return (BinaryFunctionPass::shouldPrint(BF) &&
376           opts::ReorderBlocks != ReorderBasicBlocks::LT_NONE);
377 }
378 
379 bool ReorderBasicBlocks::shouldOptimize(const BinaryFunction &BF) const {
380   // Apply execution count threshold
381   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
382     return false;
383 
384   return BinaryFunctionPass::shouldOptimize(BF);
385 }
386 
387 Error ReorderBasicBlocks::runOnFunctions(BinaryContext &BC) {
388   if (opts::ReorderBlocks == ReorderBasicBlocks::LT_NONE)
389     return Error::success();
390 
391   std::atomic_uint64_t ModifiedFuncCount(0);
392   std::mutex FunctionEditDistanceMutex;
393   DenseMap<const BinaryFunction *, uint64_t> FunctionEditDistance;
394 
395   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
396     SmallVector<const BinaryBasicBlock *, 0> OldBlockOrder;
397     if (opts::PrintFuncStat > 0)
398       llvm::copy(BF.getLayout().blocks(), std::back_inserter(OldBlockOrder));
399 
400     const bool LayoutChanged =
401         modifyFunctionLayout(BF, opts::ReorderBlocks, opts::MinBranchClusters);
402     if (LayoutChanged) {
403       ModifiedFuncCount.fetch_add(1, std::memory_order_relaxed);
404       if (opts::PrintFuncStat > 0) {
405         const uint64_t Distance = BF.getLayout().getEditDistance(OldBlockOrder);
406         std::lock_guard<std::mutex> Lock(FunctionEditDistanceMutex);
407         FunctionEditDistance[&BF] = Distance;
408       }
409     }
410   };
411 
412   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
413     return !shouldOptimize(BF);
414   };
415 
416   ParallelUtilities::runOnEachFunction(
417       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
418       "ReorderBasicBlocks");
419   const size_t NumAllProfiledFunctions =
420       BC.NumProfiledFuncs + BC.NumStaleProfileFuncs;
421 
422   BC.outs() << "BOLT-INFO: basic block reordering modified layout of "
423             << format(
424                    "%zu functions (%.2lf%% of profiled, %.2lf%% of total)\n",
425                    ModifiedFuncCount.load(std::memory_order_relaxed),
426                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
427                        NumAllProfiledFunctions,
428                    100.0 * ModifiedFuncCount.load(std::memory_order_relaxed) /
429                        BC.getBinaryFunctions().size());
430 
431   if (opts::PrintFuncStat > 0) {
432     raw_ostream &OS = BC.outs();
433     // Copy all the values into vector in order to sort them
434     std::map<uint64_t, BinaryFunction &> ScoreMap;
435     auto &BFs = BC.getBinaryFunctions();
436     for (auto It = BFs.begin(); It != BFs.end(); ++It)
437       ScoreMap.insert(std::pair<uint64_t, BinaryFunction &>(
438           It->second.getFunctionScore(), It->second));
439 
440     OS << "\nBOLT-INFO: Printing Function Statistics:\n\n";
441     OS << "           There are " << BFs.size() << " functions in total. \n";
442     OS << "           Number of functions being modified: "
443        << ModifiedFuncCount.load(std::memory_order_relaxed) << "\n";
444     OS << "           User asks for detailed information on top "
445        << opts::PrintFuncStat << " functions. (Ranked by function score)"
446        << "\n\n";
447     uint64_t I = 0;
448     for (std::map<uint64_t, BinaryFunction &>::reverse_iterator Rit =
449              ScoreMap.rbegin();
450          Rit != ScoreMap.rend() && I < opts::PrintFuncStat; ++Rit, ++I) {
451       BinaryFunction &Function = Rit->second;
452 
453       OS << "           Information for function of top: " << (I + 1) << ": \n";
454       OS << "             Function Score is: " << Function.getFunctionScore()
455          << "\n";
456       OS << "             There are " << Function.size()
457          << " number of blocks in this function.\n";
458       OS << "             There are " << Function.getInstructionCount()
459          << " number of instructions in this function.\n";
460       OS << "             The edit distance for this function is: "
461          << FunctionEditDistance.lookup(&Function) << "\n\n";
462     }
463   }
464   return Error::success();
465 }
466 
467 bool ReorderBasicBlocks::modifyFunctionLayout(BinaryFunction &BF,
468                                               LayoutType Type,
469                                               bool MinBranchClusters) const {
470   if (BF.size() == 0 || Type == LT_NONE)
471     return false;
472 
473   BinaryFunction::BasicBlockOrderType NewLayout;
474   std::unique_ptr<ReorderAlgorithm> Algo;
475 
476   // Cannot do optimal layout without profile.
477   if (Type != LT_REVERSE && !BF.hasValidProfile())
478     return false;
479 
480   if (Type == LT_REVERSE) {
481     Algo.reset(new ReverseReorderAlgorithm());
482   } else if (BF.size() <= opts::TSPThreshold && Type != LT_OPTIMIZE_SHUFFLE) {
483     // Work on optimal solution if problem is small enough
484     LLVM_DEBUG(dbgs() << "finding optimal block layout for " << BF << "\n");
485     Algo.reset(new TSPReorderAlgorithm());
486   } else {
487     LLVM_DEBUG(dbgs() << "running block layout heuristics on " << BF << "\n");
488 
489     std::unique_ptr<ClusterAlgorithm> CAlgo;
490     if (MinBranchClusters)
491       CAlgo.reset(new MinBranchGreedyClusterAlgorithm());
492     else
493       CAlgo.reset(new PHGreedyClusterAlgorithm());
494 
495     switch (Type) {
496     case LT_OPTIMIZE:
497       Algo.reset(new OptimizeReorderAlgorithm(std::move(CAlgo)));
498       break;
499 
500     case LT_OPTIMIZE_BRANCH:
501       Algo.reset(new OptimizeBranchReorderAlgorithm(std::move(CAlgo)));
502       break;
503 
504     case LT_OPTIMIZE_CACHE:
505       Algo.reset(new OptimizeCacheReorderAlgorithm(std::move(CAlgo)));
506       break;
507 
508     case LT_OPTIMIZE_EXT_TSP:
509       Algo.reset(new ExtTSPReorderAlgorithm());
510       break;
511 
512     case LT_OPTIMIZE_SHUFFLE:
513       Algo.reset(new RandomClusterReorderAlgorithm(std::move(CAlgo)));
514       break;
515 
516     default:
517       llvm_unreachable("unexpected layout type");
518     }
519   }
520 
521   Algo->reorderBasicBlocks(BF, NewLayout);
522 
523   return BF.getLayout().update(NewLayout);
524 }
525 
526 Error FixupBranches::runOnFunctions(BinaryContext &BC) {
527   for (auto &It : BC.getBinaryFunctions()) {
528     BinaryFunction &Function = It.second;
529     if (!BC.shouldEmit(Function) || !Function.isSimple())
530       continue;
531 
532     Function.fixBranches();
533   }
534   return Error::success();
535 }
536 
537 Error FinalizeFunctions::runOnFunctions(BinaryContext &BC) {
538   std::atomic<bool> HasFatal{false};
539   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
540     if (!BF.finalizeCFIState()) {
541       if (BC.HasRelocations) {
542         BC.errs() << "BOLT-ERROR: unable to fix CFI state for function " << BF
543                   << ". Exiting.\n";
544         HasFatal = true;
545         return;
546       }
547       BF.setSimple(false);
548       return;
549     }
550 
551     BF.setFinalized();
552 
553     // Update exception handling information.
554     BF.updateEHRanges();
555   };
556 
557   ParallelUtilities::PredicateTy SkipPredicate = [&](const BinaryFunction &BF) {
558     return !BC.shouldEmit(BF);
559   };
560 
561   ParallelUtilities::runOnEachFunction(
562       BC, ParallelUtilities::SchedulingPolicy::SP_CONSTANT, WorkFun,
563       SkipPredicate, "FinalizeFunctions");
564   if (HasFatal)
565     return createFatalBOLTError("finalize CFI state failure");
566   return Error::success();
567 }
568 
569 Error CheckLargeFunctions::runOnFunctions(BinaryContext &BC) {
570   if (BC.HasRelocations)
571     return Error::success();
572 
573   // If the function wouldn't fit, mark it as non-simple. Otherwise, we may emit
574   // incorrect meta data.
575   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
576     uint64_t HotSize, ColdSize;
577     std::tie(HotSize, ColdSize) =
578         BC.calculateEmittedSize(BF, /*FixBranches=*/false);
579     if (HotSize > BF.getMaxSize()) {
580       if (opts::PrintLargeFunctions)
581         BC.outs() << "BOLT-INFO: " << BF << " size exceeds allocated space by "
582                   << (HotSize - BF.getMaxSize()) << " bytes\n";
583       BF.setSimple(false);
584     }
585   };
586 
587   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
588     return !shouldOptimize(BF);
589   };
590 
591   ParallelUtilities::runOnEachFunction(
592       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
593       SkipFunc, "CheckLargeFunctions");
594 
595   return Error::success();
596 }
597 
598 bool CheckLargeFunctions::shouldOptimize(const BinaryFunction &BF) const {
599   // Unlike other passes, allow functions in non-CFG state.
600   return BF.isSimple() && !BF.isIgnored();
601 }
602 
603 Error LowerAnnotations::runOnFunctions(BinaryContext &BC) {
604   // Convert GnuArgsSize annotations into CFIs.
605   for (BinaryFunction *BF : BC.getAllBinaryFunctions()) {
606     for (FunctionFragment &FF : BF->getLayout().fragments()) {
607       // Reset at the start of the new fragment.
608       int64_t CurrentGnuArgsSize = 0;
609 
610       for (BinaryBasicBlock *const BB : FF) {
611         for (auto II = BB->begin(); II != BB->end(); ++II) {
612           if (!BF->usesGnuArgsSize() || !BC.MIB->isInvoke(*II))
613             continue;
614 
615           const int64_t NewGnuArgsSize = BC.MIB->getGnuArgsSize(*II);
616           assert(NewGnuArgsSize >= 0 && "Expected non-negative GNU_args_size.");
617           if (NewGnuArgsSize == CurrentGnuArgsSize)
618             continue;
619 
620           auto InsertII = BF->addCFIInstruction(
621               BB, II,
622               MCCFIInstruction::createGnuArgsSize(nullptr, NewGnuArgsSize));
623           CurrentGnuArgsSize = NewGnuArgsSize;
624           II = std::next(InsertII);
625         }
626       }
627     }
628   }
629   return Error::success();
630 }
631 
632 // Check for dirty state in MCSymbol objects that might be a consequence
633 // of running calculateEmittedSize() in parallel, during split functions
634 // pass. If an inconsistent state is found (symbol already registered or
635 // already defined), clean it.
636 Error CleanMCState::runOnFunctions(BinaryContext &BC) {
637   MCContext &Ctx = *BC.Ctx;
638   for (const auto &SymMapEntry : Ctx.getSymbols()) {
639     const MCSymbol *S = SymMapEntry.second;
640     if (S->isDefined()) {
641       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
642                         << "\" is already defined\n");
643       const_cast<MCSymbol *>(S)->setUndefined();
644     }
645     if (S->isRegistered()) {
646       LLVM_DEBUG(dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName()
647                         << "\" is already registered\n");
648       const_cast<MCSymbol *>(S)->setIsRegistered(false);
649     }
650     LLVM_DEBUG(if (S->isVariable()) {
651       dbgs() << "BOLT-DEBUG: Symbol \"" << S->getName() << "\" is variable\n";
652     });
653   }
654   return Error::success();
655 }
656 
657 // This peephole fixes jump instructions that jump to another basic
658 // block with a single jump instruction, e.g.
659 //
660 // B0: ...
661 //     jmp  B1   (or jcc B1)
662 //
663 // B1: jmp  B2
664 //
665 // ->
666 //
667 // B0: ...
668 //     jmp  B2   (or jcc B2)
669 //
670 static uint64_t fixDoubleJumps(BinaryFunction &Function, bool MarkInvalid) {
671   uint64_t NumDoubleJumps = 0;
672 
673   MCContext *Ctx = Function.getBinaryContext().Ctx.get();
674   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
675   for (BinaryBasicBlock &BB : Function) {
676     auto checkAndPatch = [&](BinaryBasicBlock *Pred, BinaryBasicBlock *Succ,
677                              const MCSymbol *SuccSym) {
678       // Ignore infinite loop jumps or fallthrough tail jumps.
679       if (Pred == Succ || Succ == &BB)
680         return false;
681 
682       if (Succ) {
683         const MCSymbol *TBB = nullptr;
684         const MCSymbol *FBB = nullptr;
685         MCInst *CondBranch = nullptr;
686         MCInst *UncondBranch = nullptr;
687         bool Res = Pred->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
688         if (!Res) {
689           LLVM_DEBUG(dbgs() << "analyzeBranch failed in peepholes in block:\n";
690                      Pred->dump());
691           return false;
692         }
693         Pred->replaceSuccessor(&BB, Succ);
694 
695         // We must patch up any existing branch instructions to match up
696         // with the new successor.
697         assert((CondBranch || (!CondBranch && Pred->succ_size() == 1)) &&
698                "Predecessor block has inconsistent number of successors");
699         if (CondBranch && MIB->getTargetSymbol(*CondBranch) == BB.getLabel()) {
700           MIB->replaceBranchTarget(*CondBranch, Succ->getLabel(), Ctx);
701         } else if (UncondBranch &&
702                    MIB->getTargetSymbol(*UncondBranch) == BB.getLabel()) {
703           MIB->replaceBranchTarget(*UncondBranch, Succ->getLabel(), Ctx);
704         } else if (!UncondBranch) {
705           assert(Function.getLayout().getBasicBlockAfter(Pred, false) != Succ &&
706                  "Don't add an explicit jump to a fallthrough block.");
707           Pred->addBranchInstruction(Succ);
708         }
709       } else {
710         // Succ will be null in the tail call case.  In this case we
711         // need to explicitly add a tail call instruction.
712         MCInst *Branch = Pred->getLastNonPseudoInstr();
713         if (Branch && MIB->isUnconditionalBranch(*Branch)) {
714           assert(MIB->getTargetSymbol(*Branch) == BB.getLabel());
715           Pred->removeSuccessor(&BB);
716           Pred->eraseInstruction(Pred->findInstruction(Branch));
717           Pred->addTailCallInstruction(SuccSym);
718           MCInst *TailCall = Pred->getLastNonPseudoInstr();
719           assert(TailCall);
720           MIB->setOffset(*TailCall, BB.getOffset());
721         } else {
722           return false;
723         }
724       }
725 
726       ++NumDoubleJumps;
727       LLVM_DEBUG(dbgs() << "Removed double jump in " << Function << " from "
728                         << Pred->getName() << " -> " << BB.getName() << " to "
729                         << Pred->getName() << " -> " << SuccSym->getName()
730                         << (!Succ ? " (tail)\n" : "\n"));
731 
732       return true;
733     };
734 
735     if (BB.getNumNonPseudos() != 1 || BB.isLandingPad())
736       continue;
737 
738     MCInst *Inst = BB.getFirstNonPseudoInstr();
739     const bool IsTailCall = MIB->isTailCall(*Inst);
740 
741     if (!MIB->isUnconditionalBranch(*Inst) && !IsTailCall)
742       continue;
743 
744     // If we operate after SCTC make sure it's not a conditional tail call.
745     if (IsTailCall && MIB->isConditionalBranch(*Inst))
746       continue;
747 
748     const MCSymbol *SuccSym = MIB->getTargetSymbol(*Inst);
749     BinaryBasicBlock *Succ = BB.getSuccessor();
750 
751     if (((!Succ || &BB == Succ) && !IsTailCall) || (IsTailCall && !SuccSym))
752       continue;
753 
754     std::vector<BinaryBasicBlock *> Preds = {BB.pred_begin(), BB.pred_end()};
755 
756     for (BinaryBasicBlock *Pred : Preds) {
757       if (Pred->isLandingPad())
758         continue;
759 
760       if (Pred->getSuccessor() == &BB ||
761           (Pred->getConditionalSuccessor(true) == &BB && !IsTailCall) ||
762           Pred->getConditionalSuccessor(false) == &BB)
763         if (checkAndPatch(Pred, Succ, SuccSym) && MarkInvalid)
764           BB.markValid(BB.pred_size() != 0 || BB.isLandingPad() ||
765                        BB.isEntryPoint());
766     }
767   }
768 
769   return NumDoubleJumps;
770 }
771 
772 bool SimplifyConditionalTailCalls::shouldRewriteBranch(
773     const BinaryBasicBlock *PredBB, const MCInst &CondBranch,
774     const BinaryBasicBlock *BB, const bool DirectionFlag) {
775   if (BeenOptimized.count(PredBB))
776     return false;
777 
778   const bool IsForward = BinaryFunction::isForwardBranch(PredBB, BB);
779 
780   if (IsForward)
781     ++NumOrigForwardBranches;
782   else
783     ++NumOrigBackwardBranches;
784 
785   if (opts::SctcMode == opts::SctcAlways)
786     return true;
787 
788   if (opts::SctcMode == opts::SctcPreserveDirection)
789     return IsForward == DirectionFlag;
790 
791   const ErrorOr<std::pair<double, double>> Frequency =
792       PredBB->getBranchStats(BB);
793 
794   // It's ok to rewrite the conditional branch if the new target will be
795   // a backward branch.
796 
797   // If no data available for these branches, then it should be ok to
798   // do the optimization since it will reduce code size.
799   if (Frequency.getError())
800     return true;
801 
802   // TODO: should this use misprediction frequency instead?
803   const bool Result = (IsForward && Frequency.get().first >= 0.5) ||
804                       (!IsForward && Frequency.get().first <= 0.5);
805 
806   return Result == DirectionFlag;
807 }
808 
809 uint64_t SimplifyConditionalTailCalls::fixTailCalls(BinaryFunction &BF) {
810   // Need updated indices to correctly detect branch' direction.
811   BF.getLayout().updateLayoutIndices();
812   BF.markUnreachableBlocks();
813 
814   MCPlusBuilder *MIB = BF.getBinaryContext().MIB.get();
815   MCContext *Ctx = BF.getBinaryContext().Ctx.get();
816   uint64_t NumLocalCTCCandidates = 0;
817   uint64_t NumLocalCTCs = 0;
818   uint64_t LocalCTCTakenCount = 0;
819   uint64_t LocalCTCExecCount = 0;
820   std::vector<std::pair<BinaryBasicBlock *, const BinaryBasicBlock *>>
821       NeedsUncondBranch;
822 
823   // Will block be deleted by UCE?
824   auto isValid = [](const BinaryBasicBlock *BB) {
825     return (BB->pred_size() != 0 || BB->isLandingPad() || BB->isEntryPoint());
826   };
827 
828   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
829     // Locate BB with a single direct tail-call instruction.
830     if (BB->getNumNonPseudos() != 1)
831       continue;
832 
833     MCInst *Instr = BB->getFirstNonPseudoInstr();
834     if (!MIB->isTailCall(*Instr) || MIB->isConditionalBranch(*Instr))
835       continue;
836 
837     const MCSymbol *CalleeSymbol = MIB->getTargetSymbol(*Instr);
838     if (!CalleeSymbol)
839       continue;
840 
841     // Detect direction of the possible conditional tail call.
842     const bool IsForwardCTC = BF.isForwardCall(CalleeSymbol);
843 
844     // Iterate through all predecessors.
845     for (BinaryBasicBlock *PredBB : BB->predecessors()) {
846       BinaryBasicBlock *CondSucc = PredBB->getConditionalSuccessor(true);
847       if (!CondSucc)
848         continue;
849 
850       ++NumLocalCTCCandidates;
851 
852       const MCSymbol *TBB = nullptr;
853       const MCSymbol *FBB = nullptr;
854       MCInst *CondBranch = nullptr;
855       MCInst *UncondBranch = nullptr;
856       bool Result = PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
857 
858       // analyzeBranch() can fail due to unusual branch instructions, e.g. jrcxz
859       if (!Result) {
860         LLVM_DEBUG(dbgs() << "analyzeBranch failed in SCTC in block:\n";
861                    PredBB->dump());
862         continue;
863       }
864 
865       assert(Result && "internal error analyzing conditional branch");
866       assert(CondBranch && "conditional branch expected");
867 
868       // Skip dynamic branches for now.
869       if (BF.getBinaryContext().MIB->isDynamicBranch(*CondBranch))
870         continue;
871 
872       // It's possible that PredBB is also a successor to BB that may have
873       // been processed by a previous iteration of the SCTC loop, in which
874       // case it may have been marked invalid.  We should skip rewriting in
875       // this case.
876       if (!PredBB->isValid()) {
877         assert(PredBB->isSuccessor(BB) &&
878                "PredBB should be valid if it is not a successor to BB");
879         continue;
880       }
881 
882       // We don't want to reverse direction of the branch in new order
883       // without further profile analysis.
884       const bool DirectionFlag = CondSucc == BB ? IsForwardCTC : !IsForwardCTC;
885       if (!shouldRewriteBranch(PredBB, *CondBranch, BB, DirectionFlag))
886         continue;
887 
888       // Record this block so that we don't try to optimize it twice.
889       BeenOptimized.insert(PredBB);
890 
891       uint64_t Count = 0;
892       if (CondSucc != BB) {
893         // Patch the new target address into the conditional branch.
894         MIB->reverseBranchCondition(*CondBranch, CalleeSymbol, Ctx);
895         // Since we reversed the condition on the branch we need to change
896         // the target for the unconditional branch or add a unconditional
897         // branch to the old target.  This has to be done manually since
898         // fixupBranches is not called after SCTC.
899         NeedsUncondBranch.emplace_back(PredBB, CondSucc);
900         Count = PredBB->getFallthroughBranchInfo().Count;
901       } else {
902         // Change destination of the conditional branch.
903         MIB->replaceBranchTarget(*CondBranch, CalleeSymbol, Ctx);
904         Count = PredBB->getTakenBranchInfo().Count;
905       }
906       const uint64_t CTCTakenFreq =
907           Count == BinaryBasicBlock::COUNT_NO_PROFILE ? 0 : Count;
908 
909       // Annotate it, so "isCall" returns true for this jcc
910       MIB->setConditionalTailCall(*CondBranch);
911       // Add info about the conditional tail call frequency, otherwise this
912       // info will be lost when we delete the associated BranchInfo entry
913       auto &CTCAnnotation =
914           MIB->getOrCreateAnnotationAs<uint64_t>(*CondBranch, "CTCTakenCount");
915       CTCAnnotation = CTCTakenFreq;
916       // Preserve Offset annotation, used in BAT.
917       // Instr is a direct tail call instruction that was created when CTCs are
918       // first expanded, and has the original CTC offset set.
919       if (std::optional<uint32_t> Offset = MIB->getOffset(*Instr))
920         MIB->setOffset(*CondBranch, *Offset);
921 
922       // Remove the unused successor which may be eliminated later
923       // if there are no other users.
924       PredBB->removeSuccessor(BB);
925       // Update BB execution count
926       if (CTCTakenFreq && CTCTakenFreq <= BB->getKnownExecutionCount())
927         BB->setExecutionCount(BB->getExecutionCount() - CTCTakenFreq);
928       else if (CTCTakenFreq > BB->getKnownExecutionCount())
929         BB->setExecutionCount(0);
930 
931       ++NumLocalCTCs;
932       LocalCTCTakenCount += CTCTakenFreq;
933       LocalCTCExecCount += PredBB->getKnownExecutionCount();
934     }
935 
936     // Remove the block from CFG if all predecessors were removed.
937     BB->markValid(isValid(BB));
938   }
939 
940   // Add unconditional branches at the end of BBs to new successors
941   // as long as the successor is not a fallthrough.
942   for (auto &Entry : NeedsUncondBranch) {
943     BinaryBasicBlock *PredBB = Entry.first;
944     const BinaryBasicBlock *CondSucc = Entry.second;
945 
946     const MCSymbol *TBB = nullptr;
947     const MCSymbol *FBB = nullptr;
948     MCInst *CondBranch = nullptr;
949     MCInst *UncondBranch = nullptr;
950     PredBB->analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
951 
952     // Find the next valid block.  Invalid blocks will be deleted
953     // so they shouldn't be considered fallthrough targets.
954     const BinaryBasicBlock *NextBlock =
955         BF.getLayout().getBasicBlockAfter(PredBB, false);
956     while (NextBlock && !isValid(NextBlock))
957       NextBlock = BF.getLayout().getBasicBlockAfter(NextBlock, false);
958 
959     // Get the unconditional successor to this block.
960     const BinaryBasicBlock *PredSucc = PredBB->getSuccessor();
961     assert(PredSucc && "The other branch should be a tail call");
962 
963     const bool HasFallthrough = (NextBlock && PredSucc == NextBlock);
964 
965     if (UncondBranch) {
966       if (HasFallthrough)
967         PredBB->eraseInstruction(PredBB->findInstruction(UncondBranch));
968       else
969         MIB->replaceBranchTarget(*UncondBranch, CondSucc->getLabel(), Ctx);
970     } else if (!HasFallthrough) {
971       MCInst Branch;
972       MIB->createUncondBranch(Branch, CondSucc->getLabel(), Ctx);
973       PredBB->addInstruction(Branch);
974     }
975   }
976 
977   if (NumLocalCTCs > 0) {
978     NumDoubleJumps += fixDoubleJumps(BF, true);
979     // Clean-up unreachable tail-call blocks.
980     const std::pair<unsigned, uint64_t> Stats = BF.eraseInvalidBBs();
981     DeletedBlocks += Stats.first;
982     DeletedBytes += Stats.second;
983 
984     assert(BF.validateCFG());
985   }
986 
987   LLVM_DEBUG(dbgs() << "BOLT: created " << NumLocalCTCs
988                     << " conditional tail calls from a total of "
989                     << NumLocalCTCCandidates << " candidates in function " << BF
990                     << ". CTCs execution count for this function is "
991                     << LocalCTCExecCount << " and CTC taken count is "
992                     << LocalCTCTakenCount << "\n";);
993 
994   NumTailCallsPatched += NumLocalCTCs;
995   NumCandidateTailCalls += NumLocalCTCCandidates;
996   CTCExecCount += LocalCTCExecCount;
997   CTCTakenCount += LocalCTCTakenCount;
998 
999   return NumLocalCTCs > 0;
1000 }
1001 
1002 Error SimplifyConditionalTailCalls::runOnFunctions(BinaryContext &BC) {
1003   if (!BC.isX86())
1004     return Error::success();
1005 
1006   for (auto &It : BC.getBinaryFunctions()) {
1007     BinaryFunction &Function = It.second;
1008 
1009     if (!shouldOptimize(Function))
1010       continue;
1011 
1012     if (fixTailCalls(Function)) {
1013       Modified.insert(&Function);
1014       Function.setHasCanonicalCFG(false);
1015     }
1016   }
1017 
1018   if (NumTailCallsPatched)
1019     BC.outs() << "BOLT-INFO: SCTC: patched " << NumTailCallsPatched
1020               << " tail calls (" << NumOrigForwardBranches << " forward)"
1021               << " tail calls (" << NumOrigBackwardBranches << " backward)"
1022               << " from a total of " << NumCandidateTailCalls
1023               << " while removing " << NumDoubleJumps << " double jumps"
1024               << " and removing " << DeletedBlocks << " basic blocks"
1025               << " totalling " << DeletedBytes
1026               << " bytes of code. CTCs total execution count is "
1027               << CTCExecCount << " and the number of times CTCs are taken is "
1028               << CTCTakenCount << "\n";
1029   return Error::success();
1030 }
1031 
1032 uint64_t ShortenInstructions::shortenInstructions(BinaryFunction &Function) {
1033   uint64_t Count = 0;
1034   const BinaryContext &BC = Function.getBinaryContext();
1035   for (BinaryBasicBlock &BB : Function) {
1036     for (MCInst &Inst : BB) {
1037       // Skip shortening instructions with Size annotation.
1038       if (BC.MIB->getSize(Inst))
1039         continue;
1040 
1041       MCInst OriginalInst;
1042       if (opts::Verbosity > 2)
1043         OriginalInst = Inst;
1044 
1045       if (!BC.MIB->shortenInstruction(Inst, *BC.STI))
1046         continue;
1047 
1048       if (opts::Verbosity > 2) {
1049         BC.scopeLock();
1050         BC.outs() << "BOLT-INFO: shortening:\nBOLT-INFO:    ";
1051         BC.printInstruction(BC.outs(), OriginalInst, 0, &Function);
1052         BC.outs() << "BOLT-INFO: to:";
1053         BC.printInstruction(BC.outs(), Inst, 0, &Function);
1054       }
1055 
1056       ++Count;
1057     }
1058   }
1059 
1060   return Count;
1061 }
1062 
1063 Error ShortenInstructions::runOnFunctions(BinaryContext &BC) {
1064   std::atomic<uint64_t> NumShortened{0};
1065   if (!BC.isX86())
1066     return Error::success();
1067 
1068   ParallelUtilities::runOnEachFunction(
1069       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR,
1070       [&](BinaryFunction &BF) { NumShortened += shortenInstructions(BF); },
1071       nullptr, "ShortenInstructions");
1072 
1073   if (NumShortened)
1074     BC.outs() << "BOLT-INFO: " << NumShortened
1075               << " instructions were shortened\n";
1076   return Error::success();
1077 }
1078 
1079 void Peepholes::addTailcallTraps(BinaryFunction &Function) {
1080   MCPlusBuilder *MIB = Function.getBinaryContext().MIB.get();
1081   for (BinaryBasicBlock &BB : Function) {
1082     MCInst *Inst = BB.getLastNonPseudoInstr();
1083     if (Inst && MIB->isTailCall(*Inst) && MIB->isIndirectBranch(*Inst)) {
1084       MCInst Trap;
1085       MIB->createTrap(Trap);
1086       BB.addInstruction(Trap);
1087       ++TailCallTraps;
1088     }
1089   }
1090 }
1091 
1092 void Peepholes::removeUselessCondBranches(BinaryFunction &Function) {
1093   for (BinaryBasicBlock &BB : Function) {
1094     if (BB.succ_size() != 2)
1095       continue;
1096 
1097     BinaryBasicBlock *CondBB = BB.getConditionalSuccessor(true);
1098     BinaryBasicBlock *UncondBB = BB.getConditionalSuccessor(false);
1099     if (CondBB != UncondBB)
1100       continue;
1101 
1102     const MCSymbol *TBB = nullptr;
1103     const MCSymbol *FBB = nullptr;
1104     MCInst *CondBranch = nullptr;
1105     MCInst *UncondBranch = nullptr;
1106     bool Result = BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch);
1107 
1108     // analyzeBranch() can fail due to unusual branch instructions,
1109     // e.g. jrcxz, or jump tables (indirect jump).
1110     if (!Result || !CondBranch)
1111       continue;
1112 
1113     BB.removeDuplicateConditionalSuccessor(CondBranch);
1114     ++NumUselessCondBranches;
1115   }
1116 }
1117 
1118 Error Peepholes::runOnFunctions(BinaryContext &BC) {
1119   const char Opts =
1120       std::accumulate(opts::Peepholes.begin(), opts::Peepholes.end(), 0,
1121                       [](const char A, const PeepholeOpts B) { return A | B; });
1122   if (Opts == PEEP_NONE)
1123     return Error::success();
1124 
1125   for (auto &It : BC.getBinaryFunctions()) {
1126     BinaryFunction &Function = It.second;
1127     if (shouldOptimize(Function)) {
1128       if (Opts & PEEP_DOUBLE_JUMPS)
1129         NumDoubleJumps += fixDoubleJumps(Function, false);
1130       if (Opts & PEEP_TAILCALL_TRAPS)
1131         addTailcallTraps(Function);
1132       if (Opts & PEEP_USELESS_BRANCHES)
1133         removeUselessCondBranches(Function);
1134       assert(Function.validateCFG());
1135     }
1136   }
1137   BC.outs() << "BOLT-INFO: Peephole: " << NumDoubleJumps
1138             << " double jumps patched.\n"
1139             << "BOLT-INFO: Peephole: " << TailCallTraps
1140             << " tail call traps inserted.\n"
1141             << "BOLT-INFO: Peephole: " << NumUselessCondBranches
1142             << " useless conditional branches removed.\n";
1143   return Error::success();
1144 }
1145 
1146 bool SimplifyRODataLoads::simplifyRODataLoads(BinaryFunction &BF) {
1147   BinaryContext &BC = BF.getBinaryContext();
1148   MCPlusBuilder *MIB = BC.MIB.get();
1149 
1150   uint64_t NumLocalLoadsSimplified = 0;
1151   uint64_t NumDynamicLocalLoadsSimplified = 0;
1152   uint64_t NumLocalLoadsFound = 0;
1153   uint64_t NumDynamicLocalLoadsFound = 0;
1154 
1155   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
1156     for (MCInst &Inst : *BB) {
1157       unsigned Opcode = Inst.getOpcode();
1158       const MCInstrDesc &Desc = BC.MII->get(Opcode);
1159 
1160       // Skip instructions that do not load from memory.
1161       if (!Desc.mayLoad())
1162         continue;
1163 
1164       // Try to statically evaluate the target memory address;
1165       uint64_t TargetAddress;
1166 
1167       if (MIB->hasPCRelOperand(Inst)) {
1168         // Try to find the symbol that corresponds to the PC-relative operand.
1169         MCOperand *DispOpI = MIB->getMemOperandDisp(Inst);
1170         assert(DispOpI != Inst.end() && "expected PC-relative displacement");
1171         assert(DispOpI->isExpr() &&
1172                "found PC-relative with non-symbolic displacement");
1173 
1174         // Get displacement symbol.
1175         const MCSymbol *DisplSymbol;
1176         uint64_t DisplOffset;
1177 
1178         std::tie(DisplSymbol, DisplOffset) =
1179             MIB->getTargetSymbolInfo(DispOpI->getExpr());
1180 
1181         if (!DisplSymbol)
1182           continue;
1183 
1184         // Look up the symbol address in the global symbols map of the binary
1185         // context object.
1186         BinaryData *BD = BC.getBinaryDataByName(DisplSymbol->getName());
1187         if (!BD)
1188           continue;
1189         TargetAddress = BD->getAddress() + DisplOffset;
1190       } else if (!MIB->evaluateMemOperandTarget(Inst, TargetAddress)) {
1191         continue;
1192       }
1193 
1194       // Get the contents of the section containing the target address of the
1195       // memory operand. We are only interested in read-only sections.
1196       ErrorOr<BinarySection &> DataSection =
1197           BC.getSectionForAddress(TargetAddress);
1198       if (!DataSection || DataSection->isWritable())
1199         continue;
1200 
1201       if (BC.getRelocationAt(TargetAddress) ||
1202           BC.getDynamicRelocationAt(TargetAddress))
1203         continue;
1204 
1205       uint32_t Offset = TargetAddress - DataSection->getAddress();
1206       StringRef ConstantData = DataSection->getContents();
1207 
1208       ++NumLocalLoadsFound;
1209       if (BB->hasProfile())
1210         NumDynamicLocalLoadsFound += BB->getExecutionCount();
1211 
1212       if (MIB->replaceMemOperandWithImm(Inst, ConstantData, Offset)) {
1213         ++NumLocalLoadsSimplified;
1214         if (BB->hasProfile())
1215           NumDynamicLocalLoadsSimplified += BB->getExecutionCount();
1216       }
1217     }
1218   }
1219 
1220   NumLoadsFound += NumLocalLoadsFound;
1221   NumDynamicLoadsFound += NumDynamicLocalLoadsFound;
1222   NumLoadsSimplified += NumLocalLoadsSimplified;
1223   NumDynamicLoadsSimplified += NumDynamicLocalLoadsSimplified;
1224 
1225   return NumLocalLoadsSimplified > 0;
1226 }
1227 
1228 Error SimplifyRODataLoads::runOnFunctions(BinaryContext &BC) {
1229   for (auto &It : BC.getBinaryFunctions()) {
1230     BinaryFunction &Function = It.second;
1231     if (shouldOptimize(Function) && simplifyRODataLoads(Function))
1232       Modified.insert(&Function);
1233   }
1234 
1235   BC.outs() << "BOLT-INFO: simplified " << NumLoadsSimplified << " out of "
1236             << NumLoadsFound << " loads from a statically computed address.\n"
1237             << "BOLT-INFO: dynamic loads simplified: "
1238             << NumDynamicLoadsSimplified << "\n"
1239             << "BOLT-INFO: dynamic loads found: " << NumDynamicLoadsFound
1240             << "\n";
1241   return Error::success();
1242 }
1243 
1244 Error AssignSections::runOnFunctions(BinaryContext &BC) {
1245   for (BinaryFunction *Function : BC.getInjectedBinaryFunctions()) {
1246     Function->setCodeSectionName(BC.getInjectedCodeSectionName());
1247     Function->setColdCodeSectionName(BC.getInjectedColdCodeSectionName());
1248   }
1249 
1250   // In non-relocation mode functions have pre-assigned section names.
1251   if (!BC.HasRelocations)
1252     return Error::success();
1253 
1254   const bool UseColdSection =
1255       BC.NumProfiledFuncs > 0 ||
1256       opts::ReorderFunctions == ReorderFunctions::RT_USER;
1257   for (auto &BFI : BC.getBinaryFunctions()) {
1258     BinaryFunction &Function = BFI.second;
1259     if (opts::isHotTextMover(Function)) {
1260       Function.setCodeSectionName(BC.getHotTextMoverSectionName());
1261       Function.setColdCodeSectionName(BC.getHotTextMoverSectionName());
1262       continue;
1263     }
1264 
1265     if (!UseColdSection || Function.hasValidIndex())
1266       Function.setCodeSectionName(BC.getMainCodeSectionName());
1267     else
1268       Function.setCodeSectionName(BC.getColdCodeSectionName());
1269 
1270     if (Function.isSplit())
1271       Function.setColdCodeSectionName(BC.getColdCodeSectionName());
1272   }
1273   return Error::success();
1274 }
1275 
1276 Error PrintProfileStats::runOnFunctions(BinaryContext &BC) {
1277   double FlowImbalanceMean = 0.0;
1278   size_t NumBlocksConsidered = 0;
1279   double WorstBias = 0.0;
1280   const BinaryFunction *WorstBiasFunc = nullptr;
1281 
1282   // For each function CFG, we fill an IncomingMap with the sum of the frequency
1283   // of incoming edges for each BB. Likewise for each OutgoingMap and the sum
1284   // of the frequency of outgoing edges.
1285   using FlowMapTy = std::unordered_map<const BinaryBasicBlock *, uint64_t>;
1286   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalIncomingMaps;
1287   std::unordered_map<const BinaryFunction *, FlowMapTy> TotalOutgoingMaps;
1288 
1289   // Compute mean
1290   for (const auto &BFI : BC.getBinaryFunctions()) {
1291     const BinaryFunction &Function = BFI.second;
1292     if (Function.empty() || !Function.isSimple())
1293       continue;
1294     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1295     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1296     for (const BinaryBasicBlock &BB : Function) {
1297       uint64_t TotalOutgoing = 0ULL;
1298       auto SuccBIIter = BB.branch_info_begin();
1299       for (BinaryBasicBlock *Succ : BB.successors()) {
1300         uint64_t Count = SuccBIIter->Count;
1301         if (Count == BinaryBasicBlock::COUNT_NO_PROFILE || Count == 0) {
1302           ++SuccBIIter;
1303           continue;
1304         }
1305         TotalOutgoing += Count;
1306         IncomingMap[Succ] += Count;
1307         ++SuccBIIter;
1308       }
1309       OutgoingMap[&BB] = TotalOutgoing;
1310     }
1311 
1312     size_t NumBlocks = 0;
1313     double Mean = 0.0;
1314     for (const BinaryBasicBlock &BB : Function) {
1315       // Do not compute score for low frequency blocks, entry or exit blocks
1316       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0 || BB.isEntryPoint())
1317         continue;
1318       ++NumBlocks;
1319       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1320       Mean += fabs(Difference / IncomingMap[&BB]);
1321     }
1322 
1323     FlowImbalanceMean += Mean;
1324     NumBlocksConsidered += NumBlocks;
1325     if (!NumBlocks)
1326       continue;
1327     double FuncMean = Mean / NumBlocks;
1328     if (FuncMean > WorstBias) {
1329       WorstBias = FuncMean;
1330       WorstBiasFunc = &Function;
1331     }
1332   }
1333   if (NumBlocksConsidered > 0)
1334     FlowImbalanceMean /= NumBlocksConsidered;
1335 
1336   // Compute standard deviation
1337   NumBlocksConsidered = 0;
1338   double FlowImbalanceVar = 0.0;
1339   for (const auto &BFI : BC.getBinaryFunctions()) {
1340     const BinaryFunction &Function = BFI.second;
1341     if (Function.empty() || !Function.isSimple())
1342       continue;
1343     FlowMapTy &IncomingMap = TotalIncomingMaps[&Function];
1344     FlowMapTy &OutgoingMap = TotalOutgoingMaps[&Function];
1345     for (const BinaryBasicBlock &BB : Function) {
1346       if (IncomingMap[&BB] < 100 || OutgoingMap[&BB] == 0)
1347         continue;
1348       ++NumBlocksConsidered;
1349       const double Difference = (double)OutgoingMap[&BB] - IncomingMap[&BB];
1350       FlowImbalanceVar +=
1351           pow(fabs(Difference / IncomingMap[&BB]) - FlowImbalanceMean, 2);
1352     }
1353   }
1354   if (NumBlocksConsidered) {
1355     FlowImbalanceVar /= NumBlocksConsidered;
1356     FlowImbalanceVar = sqrt(FlowImbalanceVar);
1357   }
1358 
1359   // Report to user
1360   BC.outs() << format("BOLT-INFO: Profile bias score: %.4lf%% StDev: %.4lf%%\n",
1361                       (100.0 * FlowImbalanceMean), (100.0 * FlowImbalanceVar));
1362   if (WorstBiasFunc && opts::Verbosity >= 1) {
1363     BC.outs() << "Worst average bias observed in "
1364               << WorstBiasFunc->getPrintName() << "\n";
1365     LLVM_DEBUG(WorstBiasFunc->dump());
1366   }
1367   return Error::success();
1368 }
1369 
1370 Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
1371   uint64_t NumRegularFunctions = 0;
1372   uint64_t NumStaleProfileFunctions = 0;
1373   uint64_t NumAllStaleFunctions = 0;
1374   uint64_t NumInferredFunctions = 0;
1375   uint64_t NumNonSimpleProfiledFunctions = 0;
1376   uint64_t NumUnknownControlFlowFunctions = 0;
1377   uint64_t TotalSampleCount = 0;
1378   uint64_t StaleSampleCount = 0;
1379   uint64_t InferredSampleCount = 0;
1380   std::vector<const BinaryFunction *> ProfiledFunctions;
1381   const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
1382   for (auto &BFI : BC.getBinaryFunctions()) {
1383     const BinaryFunction &Function = BFI.second;
1384 
1385     // Ignore PLT functions for stats.
1386     if (Function.isPLTFunction())
1387       continue;
1388 
1389     ++NumRegularFunctions;
1390 
1391     if (!Function.isSimple()) {
1392       if (Function.hasProfile())
1393         ++NumNonSimpleProfiledFunctions;
1394       continue;
1395     }
1396 
1397     if (Function.hasUnknownControlFlow()) {
1398       if (opts::PrintUnknownCFG)
1399         Function.dump();
1400       else if (opts::PrintUnknown)
1401         BC.errs() << "function with unknown control flow: " << Function << '\n';
1402 
1403       ++NumUnknownControlFlowFunctions;
1404     }
1405 
1406     if (!Function.hasProfile())
1407       continue;
1408 
1409     uint64_t SampleCount = Function.getRawBranchCount();
1410     TotalSampleCount += SampleCount;
1411 
1412     if (Function.hasValidProfile()) {
1413       ProfiledFunctions.push_back(&Function);
1414       if (Function.hasInferredProfile()) {
1415         ++NumInferredFunctions;
1416         InferredSampleCount += SampleCount;
1417         ++NumAllStaleFunctions;
1418       }
1419     } else {
1420       if (opts::ReportStaleFuncs) {
1421         BC.outs() << StaleFuncsHeader;
1422         StaleFuncsHeader = "";
1423         BC.outs() << "  " << Function << '\n';
1424       }
1425       ++NumStaleProfileFunctions;
1426       StaleSampleCount += SampleCount;
1427       ++NumAllStaleFunctions;
1428     }
1429   }
1430   BC.NumProfiledFuncs = ProfiledFunctions.size();
1431   BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
1432 
1433   const size_t NumAllProfiledFunctions =
1434       ProfiledFunctions.size() + NumStaleProfileFunctions;
1435   BC.outs() << "BOLT-INFO: " << NumAllProfiledFunctions << " out of "
1436             << NumRegularFunctions << " functions in the binary ("
1437             << format("%.1f", NumAllProfiledFunctions /
1438                                   (float)NumRegularFunctions * 100.0f)
1439             << "%) have non-empty execution profile\n";
1440   if (NumNonSimpleProfiledFunctions) {
1441     BC.outs() << "BOLT-INFO: " << NumNonSimpleProfiledFunctions << " function"
1442               << (NumNonSimpleProfiledFunctions == 1 ? "" : "s")
1443               << " with profile could not be optimized\n";
1444   }
1445   if (NumAllStaleFunctions) {
1446     const float PctStale =
1447         NumAllStaleFunctions / (float)NumAllProfiledFunctions * 100.0f;
1448     const float PctStaleFuncsWithEqualBlockCount =
1449         (float)BC.Stats.NumStaleFuncsWithEqualBlockCount /
1450         NumAllStaleFunctions * 100.0f;
1451     const float PctStaleBlocksWithEqualIcount =
1452         (float)BC.Stats.NumStaleBlocksWithEqualIcount /
1453         BC.Stats.NumStaleBlocks * 100.0f;
1454     auto printErrorOrWarning = [&]() {
1455       if (PctStale > opts::StaleThreshold)
1456         BC.errs() << "BOLT-ERROR: ";
1457       else
1458         BC.errs() << "BOLT-WARNING: ";
1459     };
1460     printErrorOrWarning();
1461     BC.errs() << NumAllStaleFunctions
1462               << format(" (%.1f%% of all profiled)", PctStale) << " function"
1463               << (NumAllStaleFunctions == 1 ? "" : "s")
1464               << " have invalid (possibly stale) profile."
1465                  " Use -report-stale to see the list.\n";
1466     if (TotalSampleCount > 0) {
1467       printErrorOrWarning();
1468       BC.errs() << (StaleSampleCount + InferredSampleCount) << " out of "
1469                 << TotalSampleCount << " samples in the binary ("
1470                 << format("%.1f",
1471                           ((100.0f * (StaleSampleCount + InferredSampleCount)) /
1472                            TotalSampleCount))
1473                 << "%) belong to functions with invalid"
1474                    " (possibly stale) profile.\n";
1475     }
1476     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleFuncsWithEqualBlockCount
1477               << " stale function"
1478               << (BC.Stats.NumStaleFuncsWithEqualBlockCount == 1 ? "" : "s")
1479               << format(" (%.1f%% of all stale)",
1480                         PctStaleFuncsWithEqualBlockCount)
1481               << " have matching block count.\n";
1482     BC.outs() << "BOLT-INFO: " << BC.Stats.NumStaleBlocksWithEqualIcount
1483               << " stale block"
1484               << (BC.Stats.NumStaleBlocksWithEqualIcount == 1 ? "" : "s")
1485               << format(" (%.1f%% of all stale)", PctStaleBlocksWithEqualIcount)
1486               << " have matching icount.\n";
1487     if (PctStale > opts::StaleThreshold) {
1488       return createFatalBOLTError(
1489           Twine("BOLT-ERROR: stale functions exceed specified threshold of ") +
1490           Twine(opts::StaleThreshold.getValue()) + Twine("%. Exiting.\n"));
1491     }
1492   }
1493   if (NumInferredFunctions) {
1494     BC.outs() << format(
1495         "BOLT-INFO: inferred profile for %d (%.2f%% of profiled, "
1496         "%.2f%% of stale) functions responsible for %.2f%% samples"
1497         " (%zu out of %zu)\n",
1498         NumInferredFunctions,
1499         100.0 * NumInferredFunctions / NumAllProfiledFunctions,
1500         100.0 * NumInferredFunctions / NumAllStaleFunctions,
1501         100.0 * InferredSampleCount / TotalSampleCount, InferredSampleCount,
1502         TotalSampleCount);
1503     BC.outs() << format(
1504         "BOLT-INFO: inference found an exact match for %.2f%% of basic blocks"
1505         " (%zu out of %zu stale) responsible for %.2f%% samples"
1506         " (%zu out of %zu stale)\n",
1507         100.0 * BC.Stats.NumMatchedBlocks / BC.Stats.NumStaleBlocks,
1508         BC.Stats.NumMatchedBlocks, BC.Stats.NumStaleBlocks,
1509         100.0 * BC.Stats.MatchedSampleCount / BC.Stats.StaleSampleCount,
1510         BC.Stats.MatchedSampleCount, BC.Stats.StaleSampleCount);
1511   }
1512 
1513   if (const uint64_t NumUnusedObjects = BC.getNumUnusedProfiledObjects()) {
1514     BC.outs() << "BOLT-INFO: profile for " << NumUnusedObjects
1515               << " objects was ignored\n";
1516   }
1517 
1518   if (ProfiledFunctions.size() > 10) {
1519     if (opts::Verbosity >= 1) {
1520       BC.outs() << "BOLT-INFO: top called functions are:\n";
1521       llvm::sort(ProfiledFunctions,
1522                  [](const BinaryFunction *A, const BinaryFunction *B) {
1523                    return B->getExecutionCount() < A->getExecutionCount();
1524                  });
1525       auto SFI = ProfiledFunctions.begin();
1526       auto SFIend = ProfiledFunctions.end();
1527       for (unsigned I = 0u; I < opts::TopCalledLimit && SFI != SFIend;
1528            ++SFI, ++I)
1529         BC.outs() << "  " << **SFI << " : " << (*SFI)->getExecutionCount()
1530                   << '\n';
1531     }
1532   }
1533 
1534   if (!opts::PrintSortedBy.empty()) {
1535     std::vector<BinaryFunction *> Functions;
1536     std::map<const BinaryFunction *, DynoStats> Stats;
1537 
1538     for (auto &BFI : BC.getBinaryFunctions()) {
1539       BinaryFunction &BF = BFI.second;
1540       if (shouldOptimize(BF) && BF.hasValidProfile()) {
1541         Functions.push_back(&BF);
1542         Stats.emplace(&BF, getDynoStats(BF));
1543       }
1544     }
1545 
1546     const bool SortAll =
1547         llvm::is_contained(opts::PrintSortedBy, DynoStats::LAST_DYNO_STAT);
1548 
1549     const bool Ascending =
1550         opts::DynoStatsSortOrderOpt == opts::DynoStatsSortOrder::Ascending;
1551 
1552     if (SortAll) {
1553       llvm::stable_sort(Functions,
1554                         [Ascending, &Stats](const BinaryFunction *A,
1555                                             const BinaryFunction *B) {
1556                           return Ascending ? Stats.at(A) < Stats.at(B)
1557                                            : Stats.at(B) < Stats.at(A);
1558                         });
1559     } else {
1560       llvm::stable_sort(
1561           Functions, [Ascending, &Stats](const BinaryFunction *A,
1562                                          const BinaryFunction *B) {
1563             const DynoStats &StatsA = Stats.at(A);
1564             const DynoStats &StatsB = Stats.at(B);
1565             return Ascending ? StatsA.lessThan(StatsB, opts::PrintSortedBy)
1566                              : StatsB.lessThan(StatsA, opts::PrintSortedBy);
1567           });
1568     }
1569 
1570     BC.outs() << "BOLT-INFO: top functions sorted by ";
1571     if (SortAll) {
1572       BC.outs() << "dyno stats";
1573     } else {
1574       BC.outs() << "(";
1575       bool PrintComma = false;
1576       for (const DynoStats::Category Category : opts::PrintSortedBy) {
1577         if (PrintComma)
1578           BC.outs() << ", ";
1579         BC.outs() << DynoStats::Description(Category);
1580         PrintComma = true;
1581       }
1582       BC.outs() << ")";
1583     }
1584 
1585     BC.outs() << " are:\n";
1586     auto SFI = Functions.begin();
1587     for (unsigned I = 0; I < 100 && SFI != Functions.end(); ++SFI, ++I) {
1588       const DynoStats Stats = getDynoStats(**SFI);
1589       BC.outs() << "  " << **SFI;
1590       if (!SortAll) {
1591         BC.outs() << " (";
1592         bool PrintComma = false;
1593         for (const DynoStats::Category Category : opts::PrintSortedBy) {
1594           if (PrintComma)
1595             BC.outs() << ", ";
1596           BC.outs() << dynoStatsOptName(Category) << "=" << Stats[Category];
1597           PrintComma = true;
1598         }
1599         BC.outs() << ")";
1600       }
1601       BC.outs() << "\n";
1602     }
1603   }
1604 
1605   if (!BC.TrappedFunctions.empty()) {
1606     BC.errs() << "BOLT-WARNING: " << BC.TrappedFunctions.size() << " function"
1607               << (BC.TrappedFunctions.size() > 1 ? "s" : "")
1608               << " will trap on entry. Use -trap-avx512=0 to disable"
1609                  " traps.";
1610     if (opts::Verbosity >= 1 || BC.TrappedFunctions.size() <= 5) {
1611       BC.errs() << '\n';
1612       for (const BinaryFunction *Function : BC.TrappedFunctions)
1613         BC.errs() << "  " << *Function << '\n';
1614     } else {
1615       BC.errs() << " Use -v=1 to see the list.\n";
1616     }
1617   }
1618 
1619   // Print information on missed macro-fusion opportunities seen on input.
1620   if (BC.Stats.MissedMacroFusionPairs) {
1621     BC.outs() << format(
1622         "BOLT-INFO: the input contains %zu (dynamic count : %zu)"
1623         " opportunities for macro-fusion optimization",
1624         BC.Stats.MissedMacroFusionPairs, BC.Stats.MissedMacroFusionExecCount);
1625     switch (opts::AlignMacroOpFusion) {
1626     case MFT_NONE:
1627       BC.outs() << ". Use -align-macro-fusion to fix.\n";
1628       break;
1629     case MFT_HOT:
1630       BC.outs() << ". Will fix instances on a hot path.\n";
1631       break;
1632     case MFT_ALL:
1633       BC.outs() << " that are going to be fixed\n";
1634       break;
1635     }
1636   }
1637 
1638   // Collect and print information about suboptimal code layout on input.
1639   if (opts::ReportBadLayout) {
1640     std::vector<BinaryFunction *> SuboptimalFuncs;
1641     for (auto &BFI : BC.getBinaryFunctions()) {
1642       BinaryFunction &BF = BFI.second;
1643       if (!BF.hasValidProfile())
1644         continue;
1645 
1646       const uint64_t HotThreshold =
1647           std::max<uint64_t>(BF.getKnownExecutionCount(), 1);
1648       bool HotSeen = false;
1649       for (const BinaryBasicBlock *BB : BF.getLayout().rblocks()) {
1650         if (!HotSeen && BB->getKnownExecutionCount() > HotThreshold) {
1651           HotSeen = true;
1652           continue;
1653         }
1654         if (HotSeen && BB->getKnownExecutionCount() == 0) {
1655           SuboptimalFuncs.push_back(&BF);
1656           break;
1657         }
1658       }
1659     }
1660 
1661     if (!SuboptimalFuncs.empty()) {
1662       llvm::sort(SuboptimalFuncs,
1663                  [](const BinaryFunction *A, const BinaryFunction *B) {
1664                    return A->getKnownExecutionCount() / A->getSize() >
1665                           B->getKnownExecutionCount() / B->getSize();
1666                  });
1667 
1668       BC.outs() << "BOLT-INFO: " << SuboptimalFuncs.size()
1669                 << " functions have "
1670                    "cold code in the middle of hot code. Top functions are:\n";
1671       for (unsigned I = 0;
1672            I < std::min(static_cast<size_t>(opts::ReportBadLayout),
1673                         SuboptimalFuncs.size());
1674            ++I)
1675         SuboptimalFuncs[I]->print(BC.outs());
1676     }
1677   }
1678 
1679   if (NumUnknownControlFlowFunctions) {
1680     BC.outs() << "BOLT-INFO: " << NumUnknownControlFlowFunctions
1681               << " functions have instructions with unknown control flow";
1682     if (!opts::PrintUnknown)
1683       BC.outs() << ". Use -print-unknown to see the list.";
1684     BC.outs() << '\n';
1685   }
1686   return Error::success();
1687 }
1688 
1689 Error InstructionLowering::runOnFunctions(BinaryContext &BC) {
1690   for (auto &BFI : BC.getBinaryFunctions())
1691     for (BinaryBasicBlock &BB : BFI.second)
1692       for (MCInst &Instruction : BB)
1693         BC.MIB->lowerTailCall(Instruction);
1694   return Error::success();
1695 }
1696 
1697 Error StripRepRet::runOnFunctions(BinaryContext &BC) {
1698   if (!BC.isX86())
1699     return Error::success();
1700 
1701   uint64_t NumPrefixesRemoved = 0;
1702   uint64_t NumBytesSaved = 0;
1703   for (auto &BFI : BC.getBinaryFunctions()) {
1704     for (BinaryBasicBlock &BB : BFI.second) {
1705       auto LastInstRIter = BB.getLastNonPseudo();
1706       if (LastInstRIter == BB.rend() || !BC.MIB->isReturn(*LastInstRIter) ||
1707           !BC.MIB->deleteREPPrefix(*LastInstRIter))
1708         continue;
1709 
1710       NumPrefixesRemoved += BB.getKnownExecutionCount();
1711       ++NumBytesSaved;
1712     }
1713   }
1714 
1715   if (NumBytesSaved)
1716     BC.outs() << "BOLT-INFO: removed " << NumBytesSaved
1717               << " 'repz' prefixes"
1718                  " with estimated execution count of "
1719               << NumPrefixesRemoved << " times.\n";
1720   return Error::success();
1721 }
1722 
1723 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1724   if (!BC.isX86())
1725     return Error::success();
1726 
1727   uint64_t NumInlined = 0;
1728   uint64_t NumInlinedDyno = 0;
1729   for (auto &BFI : BC.getBinaryFunctions()) {
1730     for (BinaryBasicBlock &BB : BFI.second) {
1731       for (auto II = BB.begin(); II != BB.end(); ++II) {
1732         MCInst &Inst = *II;
1733 
1734         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1735             !Inst.getOperand(0).isExpr())
1736           continue;
1737 
1738         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1739         if (CalleeSymbol->getName() != "memcpy" &&
1740             CalleeSymbol->getName() != "memcpy@PLT" &&
1741             CalleeSymbol->getName() != "_memcpy8")
1742           continue;
1743 
1744         const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
1745         const bool IsTailCall = BC.MIB->isTailCall(Inst);
1746 
1747         const InstructionListType NewCode =
1748             BC.MIB->createInlineMemcpy(IsMemcpy8);
1749         II = BB.replaceInstruction(II, NewCode);
1750         std::advance(II, NewCode.size() - 1);
1751         if (IsTailCall) {
1752           MCInst Return;
1753           BC.MIB->createReturn(Return);
1754           II = BB.insertInstruction(std::next(II), std::move(Return));
1755         }
1756 
1757         ++NumInlined;
1758         NumInlinedDyno += BB.getKnownExecutionCount();
1759       }
1760     }
1761   }
1762 
1763   if (NumInlined) {
1764     BC.outs() << "BOLT-INFO: inlined " << NumInlined << " memcpy() calls";
1765     if (NumInlinedDyno)
1766       BC.outs() << ". The calls were executed " << NumInlinedDyno
1767                 << " times based on profile.";
1768     BC.outs() << '\n';
1769   }
1770   return Error::success();
1771 }
1772 
1773 bool SpecializeMemcpy1::shouldOptimize(const BinaryFunction &Function) const {
1774   if (!BinaryFunctionPass::shouldOptimize(Function))
1775     return false;
1776 
1777   for (const std::string &FunctionSpec : Spec) {
1778     StringRef FunctionName = StringRef(FunctionSpec).split(':').first;
1779     if (Function.hasNameRegex(FunctionName))
1780       return true;
1781   }
1782 
1783   return false;
1784 }
1785 
1786 std::set<size_t> SpecializeMemcpy1::getCallSitesToOptimize(
1787     const BinaryFunction &Function) const {
1788   StringRef SitesString;
1789   for (const std::string &FunctionSpec : Spec) {
1790     StringRef FunctionName;
1791     std::tie(FunctionName, SitesString) = StringRef(FunctionSpec).split(':');
1792     if (Function.hasNameRegex(FunctionName))
1793       break;
1794     SitesString = "";
1795   }
1796 
1797   std::set<size_t> Sites;
1798   SmallVector<StringRef, 4> SitesVec;
1799   SitesString.split(SitesVec, ':');
1800   for (StringRef SiteString : SitesVec) {
1801     if (SiteString.empty())
1802       continue;
1803     size_t Result;
1804     if (!SiteString.getAsInteger(10, Result))
1805       Sites.emplace(Result);
1806   }
1807 
1808   return Sites;
1809 }
1810 
1811 Error SpecializeMemcpy1::runOnFunctions(BinaryContext &BC) {
1812   if (!BC.isX86())
1813     return Error::success();
1814 
1815   uint64_t NumSpecialized = 0;
1816   uint64_t NumSpecializedDyno = 0;
1817   for (auto &BFI : BC.getBinaryFunctions()) {
1818     BinaryFunction &Function = BFI.second;
1819     if (!shouldOptimize(Function))
1820       continue;
1821 
1822     std::set<size_t> CallsToOptimize = getCallSitesToOptimize(Function);
1823     auto shouldOptimize = [&](size_t N) {
1824       return CallsToOptimize.empty() || CallsToOptimize.count(N);
1825     };
1826 
1827     std::vector<BinaryBasicBlock *> Blocks(Function.pbegin(), Function.pend());
1828     size_t CallSiteID = 0;
1829     for (BinaryBasicBlock *CurBB : Blocks) {
1830       for (auto II = CurBB->begin(); II != CurBB->end(); ++II) {
1831         MCInst &Inst = *II;
1832 
1833         if (!BC.MIB->isCall(Inst) || MCPlus::getNumPrimeOperands(Inst) != 1 ||
1834             !Inst.getOperand(0).isExpr())
1835           continue;
1836 
1837         const MCSymbol *CalleeSymbol = BC.MIB->getTargetSymbol(Inst);
1838         if (CalleeSymbol->getName() != "memcpy" &&
1839             CalleeSymbol->getName() != "memcpy@PLT")
1840           continue;
1841 
1842         if (BC.MIB->isTailCall(Inst))
1843           continue;
1844 
1845         ++CallSiteID;
1846 
1847         if (!shouldOptimize(CallSiteID))
1848           continue;
1849 
1850         // Create a copy of a call to memcpy(dest, src, size).
1851         MCInst MemcpyInstr = Inst;
1852 
1853         BinaryBasicBlock *OneByteMemcpyBB = CurBB->splitAt(II);
1854 
1855         BinaryBasicBlock *NextBB = nullptr;
1856         if (OneByteMemcpyBB->getNumNonPseudos() > 1) {
1857           NextBB = OneByteMemcpyBB->splitAt(OneByteMemcpyBB->begin());
1858           NextBB->eraseInstruction(NextBB->begin());
1859         } else {
1860           NextBB = OneByteMemcpyBB->getSuccessor();
1861           OneByteMemcpyBB->eraseInstruction(OneByteMemcpyBB->begin());
1862           assert(NextBB && "unexpected call to memcpy() with no return");
1863         }
1864 
1865         BinaryBasicBlock *MemcpyBB = Function.addBasicBlock();
1866         MemcpyBB->setOffset(CurBB->getInputOffset());
1867         InstructionListType CmpJCC =
1868             BC.MIB->createCmpJE(BC.MIB->getIntArgRegister(2), 1,
1869                                 OneByteMemcpyBB->getLabel(), BC.Ctx.get());
1870         CurBB->addInstructions(CmpJCC);
1871         CurBB->addSuccessor(MemcpyBB);
1872 
1873         MemcpyBB->addInstruction(std::move(MemcpyInstr));
1874         MemcpyBB->addSuccessor(NextBB);
1875         MemcpyBB->setCFIState(NextBB->getCFIState());
1876         MemcpyBB->setExecutionCount(0);
1877 
1878         // To prevent the actual call from being moved to cold, we set its
1879         // execution count to 1.
1880         if (CurBB->getKnownExecutionCount() > 0)
1881           MemcpyBB->setExecutionCount(1);
1882 
1883         InstructionListType OneByteMemcpy = BC.MIB->createOneByteMemcpy();
1884         OneByteMemcpyBB->addInstructions(OneByteMemcpy);
1885 
1886         ++NumSpecialized;
1887         NumSpecializedDyno += CurBB->getKnownExecutionCount();
1888 
1889         CurBB = NextBB;
1890 
1891         // Note: we don't expect the next instruction to be a call to memcpy.
1892         II = CurBB->begin();
1893       }
1894     }
1895   }
1896 
1897   if (NumSpecialized) {
1898     BC.outs() << "BOLT-INFO: specialized " << NumSpecialized
1899               << " memcpy() call sites for size 1";
1900     if (NumSpecializedDyno)
1901       BC.outs() << ". The calls were executed " << NumSpecializedDyno
1902                 << " times based on profile.";
1903     BC.outs() << '\n';
1904   }
1905   return Error::success();
1906 }
1907 
1908 void RemoveNops::runOnFunction(BinaryFunction &BF) {
1909   const BinaryContext &BC = BF.getBinaryContext();
1910   for (BinaryBasicBlock &BB : BF) {
1911     for (int64_t I = BB.size() - 1; I >= 0; --I) {
1912       MCInst &Inst = BB.getInstructionAtIndex(I);
1913       if (BC.MIB->isNoop(Inst) && BC.MIB->hasAnnotation(Inst, "NOP"))
1914         BB.eraseInstructionAtIndex(I);
1915     }
1916   }
1917 }
1918 
1919 Error RemoveNops::runOnFunctions(BinaryContext &BC) {
1920   ParallelUtilities::WorkFuncTy WorkFun = [&](BinaryFunction &BF) {
1921     runOnFunction(BF);
1922   };
1923 
1924   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
1925     return BF.shouldPreserveNops();
1926   };
1927 
1928   ParallelUtilities::runOnEachFunction(
1929       BC, ParallelUtilities::SchedulingPolicy::SP_INST_LINEAR, WorkFun,
1930       SkipFunc, "RemoveNops");
1931   return Error::success();
1932 }
1933 
1934 } // namespace bolt
1935 } // namespace llvm
1936