xref: /llvm-project/bolt/lib/Passes/SplitFunctions.cpp (revision 157748341358f38ab55ea3a7a64276a5d4431d77)
1 //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SplitFunctions pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/SplitFunctions.h"
14 #include "bolt/Core/BinaryBasicBlock.h"
15 #include "bolt/Core/BinaryFunction.h"
16 #include "bolt/Core/FunctionLayout.h"
17 #include "bolt/Core/ParallelUtilities.h"
18 #include "bolt/Utils/CommandLineOpts.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/Sequence.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Support/CommandLine.h"
24 #include "llvm/Support/FormatVariadic.h"
25 #include <algorithm>
26 #include <iterator>
27 #include <memory>
28 #include <numeric>
29 #include <random>
30 #include <vector>
31 
32 #define DEBUG_TYPE "bolt-opts"
33 
34 using namespace llvm;
35 using namespace bolt;
36 
37 namespace {
38 class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
39 public:
40   explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
41       : cl::parser<bool>(O) {}
42 
43   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
44     if (Arg == "2" || Arg == "3") {
45       Value = true;
46       errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
47                         "for option -{1} is deprecated\n",
48                         Arg, ArgName);
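      // Per the cl::parser convention, returning false signals a successful
      // parse, so the deprecated numeric value is still accepted (as true).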
49       return false;
50     }
51     return cl::parser<bool>::parse(O, ArgName, Arg, Value);
52   }
53 };
54 } // namespace
55 
56 namespace opts {
57 
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bool> SplitEH;
61 extern cl::opt<unsigned> ExecutionCountThreshold;
62 extern cl::opt<uint32_t> RandomSeed;
63 
64 static cl::opt<bool> AggressiveSplitting(
65     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
66     cl::cat(BoltOptCategory));
67 
68 static cl::opt<unsigned> SplitAlignThreshold(
69     "split-align-threshold",
70     cl::desc("when deciding to split a function, apply this alignment "
71              "while doing the size comparison (see -split-threshold). "
72              "Default value: 2."),
73     cl::init(2),
74 
75     cl::Hidden, cl::cat(BoltOptCategory));
76 
77 static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
78     SplitFunctions("split-functions",
79                    cl::desc("split functions into fragments"),
80                    cl::cat(BoltOptCategory));
81 
82 static cl::opt<unsigned> SplitThreshold(
83     "split-threshold",
84     cl::desc("split a function only if its main size is reduced by more "
85              "than the given number of bytes. Default value: 0, i.e. split "
86              "iff the size is reduced. Note that on some architectures the "
87              "size can increase after splitting."),
88     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
89 
90 static cl::opt<SplitFunctionsStrategy> SplitStrategy(
91     "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
92     cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
93                           "split each function into a hot and cold fragment "
94                           "using profiling information")),
95     cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
96                           "split each function into a hot, warm, and cold "
97                           "fragment using profiling information")),
98     cl::values(clEnumValN(
99         SplitFunctionsStrategy::Random2, "random2",
100         "split each function into a hot and cold fragment at a randomly chosen "
101         "split point (ignoring any available profiling information)")),
102     cl::values(clEnumValN(
103         SplitFunctionsStrategy::RandomN, "randomN",
104         "split each function into N fragments at randomly chosen split "
105         "points (ignoring any available profiling information)")),
106     cl::values(clEnumValN(
107         SplitFunctionsStrategy::All, "all",
108         "split all basic blocks of each function into fragments such that each "
109         "fragment contains exactly a single basic block")),
110     cl::desc("strategy used to partition blocks into fragments"),
111     cl::cat(BoltOptCategory));
112 
113 static cl::opt<double> CallScale(
114     "call-scale",
115     cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
116     cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
117 
118 static cl::opt<double>
119     CallPower("call-power",
120               cl::desc("Call score power (when --split-strategy=cdsplit)"),
121               cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
122 
123 static cl::opt<double>
124     JumpPower("jump-power",
125               cl::desc("Jump score power (when --split-strategy=cdsplit)"),
126               cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
127 } // namespace opts
128 
129 namespace {
130 bool hasFullProfile(const BinaryFunction &BF) {
131   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
132     return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
133   });
134 }
135 
136 bool allBlocksCold(const BinaryFunction &BF) {
137   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
138     return BB.getExecutionCount() == 0;
139   });
140 }
141 
142 struct SplitProfile2 final : public SplitStrategy {
143   bool canSplit(const BinaryFunction &BF) override {
144     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
145   }
146 
147   bool compactFragments() override { return true; }
148 
149   void fragment(const BlockIt Start, const BlockIt End) override {
150     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
151       if (BB->getExecutionCount() == 0)
152         BB->setFragmentNum(FragmentNum::cold());
153     }
154   }
155 };
156 
157 struct SplitCacheDirected final : public SplitStrategy {
158   BinaryContext &BC;
159   using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
160 
161   bool canSplit(const BinaryFunction &BF) override {
162     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
163   }
164 
165   explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
166     initializeAuxiliaryVariables();
167     buildCallGraph();
168   }
169 
170   // When some functions are hot-warm split and others are hot-warm-cold split,
171   // we do not want to change the fragment numbers of the blocks in the hot-warm
172   // split functions.
173   bool compactFragments() override { return false; }
174 
175   void fragment(const BlockIt Start, const BlockIt End) override {
176     BasicBlockOrder BlockOrder(Start, End);
177     BinaryFunction &BF = *BlockOrder.front()->getFunction();
178     // No need to re-split small functions.
179     if (BlockOrder.size() <= 2)
180       return;
181 
182     size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
183     assert(BestSplitIndex < BlockOrder.size());
184 
185     // Assign fragments based on the computed best split index.
186     // All basic blocks with index up to the best split index become hot.
187     // All remaining blocks become warm or cold depending on whether their
188     // execution count is greater than zero.
189     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
190       BinaryBasicBlock *BB = BlockOrder[Index];
191       if (Index <= BestSplitIndex)
192         BB->setFragmentNum(FragmentNum::main());
193       else
194         BB->setFragmentNum(BB->getKnownExecutionCount() > 0
195                                ? FragmentNum::warm()
196                                : FragmentNum::cold());
197     }
198   }
199 
200 private:
201   struct CallInfo {
202     size_t Length;
203     size_t Count;
204   };
205 
206   struct SplitScore {
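    // A SplitIndex of size_t(-1) marks a default/invalid score, i.e. no split
    // point has been selected yet (see findSplitIndex()).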
207     size_t SplitIndex = size_t(-1);
208     size_t HotSizeReduction = 0;
209     double LocalScore = 0;
210     double CoverCallScore = 0;
211 
212     double sum() const { return LocalScore + CoverCallScore; }
213   };
214 
215   // Auxiliary variables used by the algorithm.
216   size_t TotalNumBlocks{0};
217   size_t OrigHotSectionSize{0};
218   DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
219   DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
220   DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
221 
222   // Call graph.
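  // Both vectors are indexed by the global block index: Callers[GI] lists the
  // blocks that call the function whose entry block has index GI, and
  // Callees[GI] lists the entry blocks of the functions called from the block
  // with index GI.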
223   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
224   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;
225 
226   bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
227     // Only a subset of the functions in the binary will be considered
228     // for initializing auxiliary variables and building the call graph.
229     return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
230   }
231 
232   void initializeAuxiliaryVariables() {
233     for (BinaryFunction *BF : BC.getSortedFunctions()) {
234       if (!shouldConsiderForCallGraph(*BF))
235         continue;
236 
237       // Calculate the size of each BB after hot-cold splitting.
238       // This populates BinaryBasicBlock::OutputAddressRange which
239       // can be used to compute the size of each BB.
240       BC.calculateEmittedSize(*BF, /*FixBranches=*/true);
241 
242       for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
243         // Unique global index.
244         GlobalIndices[BB] = TotalNumBlocks;
245         TotalNumBlocks++;
246 
247         // Block size after hot-cold splitting.
248         BBSizes[BB] = BB->getOutputSize();
249 
250         // Hot block offset after hot-cold splitting.
251         BBOffsets[BB] = OrigHotSectionSize;
252         if (!BB->isSplit())
253           OrigHotSectionSize += BBSizes[BB];
254       }
255     }
256   }
257 
258   void buildCallGraph() {
259     Callers.resize(TotalNumBlocks);
260     Callees.resize(TotalNumBlocks);
261     for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
262       if (!shouldConsiderForCallGraph(*SrcFunction))
263         continue;
264 
265       for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
266         // Skip blocks that are not executed
267         if (SrcBB.getKnownExecutionCount() == 0)
268           continue;
269 
270         // Find call instructions and extract target symbols from each one
271         for (const MCInst &Inst : SrcBB) {
272           if (!BC.MIB->isCall(Inst))
273             continue;
274 
275           // Call info
276           const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
277           // Ignore calls w/o information
278           if (!DstSym)
279             continue;
280 
281           const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
282           // Ignore calls that do not have a valid target, but do not ignore
283           // recursive calls, because the caller block could be moved to warm.
284           if (!DstFunction || DstFunction->getLayout().block_empty())
285             continue;
286 
287           const BinaryBasicBlock *DstBB = &(DstFunction->front());
288 
289           // Record the call only if DstBB is also in functions to consider for
290           // call graph.
291           if (GlobalIndices.contains(DstBB)) {
292             Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
293             Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
294           }
295         }
296       }
297     }
298   }
299 
300   /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
301   /// start and end addresses for hot and warm basic blocks, assuming hot-warm
302   /// splitting happens at \p SplitIndex. Also return estimated end addresses
303   /// of the hot fragment before and after splitting.
304   /// The estimates take into account the potential addition of branch
305   /// instructions due to split fall-through branches, as well as the need
306   /// to use longer branch instructions for split (un)conditional branches.
307   std::pair<size_t, size_t>
308   estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
309                              const size_t SplitIndex) {
310     assert(SplitIndex < BlockOrder.size() && "Invalid split index");
311 
312     // Update function layout assuming hot-warm splitting at SplitIndex.
313     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
314       BinaryBasicBlock *BB = BlockOrder[Index];
315       if (BB->getFragmentNum() == FragmentNum::cold())
316         break;
317       BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
318                                              : FragmentNum::warm());
319     }
320     BinaryFunction *BF = BlockOrder[0]->getFunction();
321     BF->getLayout().update(BlockOrder);
322     // Populate BB.OutputAddressRange under the updated layout.
323     BC.calculateEmittedSize(*BF);
324 
325     // Populate BB.OutputAddressRange with estimated new start and end addresses
326     // and compute the old end address of the hot section and the new end
327     // address of the hot section.
328     size_t OldHotEndAddr{0};
329     size_t NewHotEndAddr{0};
330     size_t CurrentAddr = BBOffsets[BlockOrder[0]];
331     for (BinaryBasicBlock *BB : BlockOrder) {
332       // We only care about new addresses of blocks in hot/warm.
333       if (BB->getFragmentNum() == FragmentNum::cold())
334         break;
335       const size_t NewSize = BB->getOutputSize();
336       BB->setOutputStartAddress(CurrentAddr);
337       CurrentAddr += NewSize;
338       BB->setOutputEndAddress(CurrentAddr);
339       if (BB->getLayoutIndex() == SplitIndex) {
340         NewHotEndAddr = CurrentAddr;
341         // Approximate the start address of the warm fragment of the current
342         // function using the original hot section size.
343         CurrentAddr = OrigHotSectionSize;
344       }
345       OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
346     }
347     return std::make_pair(OldHotEndAddr, NewHotEndAddr);
348   }
349 
350   /// Get a collection of "shortenable" calls, that is, calls of type X->Y
351   /// when the function order is [... X ... BF ... Y ...].
352   /// If the hot fragment size of BF is reduced, then such calls are guaranteed
353   /// to get shorter by the reduced hot fragment size.
354   std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
355     // Record the length and the count of the calls that can be shortened
356     std::vector<CallInfo> CoverCalls;
357     if (opts::CallScale == 0)
358       return CoverCalls;
359 
360     const BinaryFunction *ThisBF = &BF;
361     const BinaryBasicBlock *ThisBB = &(ThisBF->front());
362     const size_t ThisGI = GlobalIndices[ThisBB];
363 
364     for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
365       if (!shouldConsiderForCallGraph(*DstBF))
366         continue;
367 
368       const BinaryBasicBlock *DstBB = &(DstBF->front());
369       if (DstBB->getKnownExecutionCount() == 0)
370         continue;
371 
372       const size_t DstGI = GlobalIndices[DstBB];
373       for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
374         const BinaryFunction *SrcBF = SrcBB->getFunction();
375         if (ThisBF == SrcBF)
376           continue;
377 
378         const size_t CallCount = SrcBB->getKnownExecutionCount();
379 
380         const size_t SrcGI = GlobalIndices[SrcBB];
381 
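        // The call is a "cover call" if the current function lies between the
        // caller and the callee in the global block order, in either direction.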
382         const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
383                                  (DstGI <= ThisGI && ThisGI < SrcGI);
384         if (!IsCoverCall)
385           continue;
386 
387         const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
388         const size_t DstBBStartAddr = BBOffsets[DstBB];
389         const size_t CallLength =
390             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
391         const CallInfo CI{CallLength, CallCount};
392         CoverCalls.emplace_back(CI);
393       }
394     }
395     return CoverCalls;
396   }
397 
398   /// Compute the edge score of a call edge.
399   double computeCallScore(uint64_t CallCount, size_t CallLength) {
400     // Increase call lengths by 1 to avoid raising 0 to a negative power.
401     return opts::CallScale * static_cast<double>(CallCount) /
402            std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
403   }
404 
405   /// Compute the edge score of a jump (branch) edge.
406   double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
407     // Increase jump lengths by 1 to avoid raising 0 to a negative power.
408     return static_cast<double>(JumpCount) /
409            std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
410   }
411 
412   /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
413   /// Increment Score.LocalScore in place by the sum.
414   void computeJumpScore(const BasicBlockOrder &BlockOrder,
415                         const size_t SplitIndex, SplitScore &Score) {
416 
417     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
418       if (SrcBB->getKnownExecutionCount() == 0)
419         continue;
420 
421       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
422 
423       for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
424         const BinaryBasicBlock *DstBB = std::get<0>(Pair);
425         const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
426         const size_t JumpCount = Branch.Count;
427 
428         if (JumpCount == 0)
429           continue;
430 
431         const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
432         const size_t NewJumpLength =
433             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
434         Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
435       }
436     }
437   }
438 
439   /// Compute sum of scores over calls originating in the current function
440   /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
441   void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
442                              const size_t SplitIndex, SplitScore &Score) {
443     if (opts::CallScale == 0)
444       return;
445 
446     // Global index of the last block in the current function.
447     // This is later used to determine whether a call originating in the
448     // current function targets a function that comes after it.
449     const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
450 
451     // The length of calls originating in the input function can increase /
452     // decrease depending on the splitting decision.
453     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
454       const size_t CallCount = SrcBB->getKnownExecutionCount();
455       // If SrcBB does not call any functions, skip it.
456       if (CallCount == 0)
457         continue;
458 
459       // Obtain an estimate of the end address of the src basic block
460       // after splitting at SplitIndex.
461       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
462 
463       for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
464         // Obtain an estimate of the start address of the dst basic block
465         // after splitting at SplitIndex. If DstBB is in a function before
466         // the current function, then its start address remains unchanged.
467         size_t DstBBStartAddr = BBOffsets[DstBB];
468         // If DstBB is in a function after the current function, then its
469         // start address should be adjusted based on the reduction in hot size.
470         if (GlobalIndices[DstBB] > LastGlobalIndex) {
471           assert(DstBBStartAddr >= Score.HotSizeReduction);
472           DstBBStartAddr -= Score.HotSizeReduction;
473         }
474         const size_t NewCallLength =
475             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
476         Score.LocalScore += computeCallScore(CallCount, NewCallLength);
477       }
478     }
479   }
480 
481   /// Compute sum of splitting scores for cover calls of the input function.
482   /// Increment Score.CoverCallScore in place by the sum.
483   void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
484                              const size_t SplitIndex,
485                              const std::vector<CallInfo> &CoverCalls,
486                              SplitScore &Score) {
487     if (opts::CallScale == 0)
488       return;
489 
490     for (const CallInfo CI : CoverCalls) {
491       assert(CI.Length >= Score.HotSizeReduction &&
492              "Length of cover calls must exceed reduced size of hot fragment.");
493       // Compute the new length of the call, which is shorter than the original
494       // one by the size of the split fragment minus the total size increase.
495       const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
496       Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
497     }
498   }
499 
500   /// Compute the split score of splitting a function at a given index.
501   /// The split score consists of local score and cover score. This function
502   /// returns \p Score of SplitScore type. It contains the local score and
503   /// cover score of the current splitting index. For easier bookkeeping and
504   /// comparison, it also stores the split index and the resulting reduction
505   /// in hot fragment size.
506   SplitScore computeSplitScore(const BinaryFunction &BF,
507                                const BasicBlockOrder &BlockOrder,
508                                const size_t SplitIndex,
509                                const std::vector<CallInfo> &CoverCalls) {
510     // Populate BinaryBasicBlock::OutputAddressRange with estimated
511     // new start and end addresses after hot-warm splitting at SplitIndex.
512     size_t OldHotEnd;
513     size_t NewHotEnd;
514     std::tie(OldHotEnd, NewHotEnd) =
515         estimatePostSplitBBAddress(BlockOrder, SplitIndex);
516 
517     SplitScore Score;
518     Score.SplitIndex = SplitIndex;
519 
520     // It's not worth splitting if OldHotEnd < NewHotEnd.
521     if (OldHotEnd < NewHotEnd)
522       return Score;
523 
524     // Hot fragment size reduction due to splitting.
525     Score.HotSizeReduction = OldHotEnd - NewHotEnd;
526 
527     // First part of LocalScore is the sum over call edges originating in the
528     // input function. These edges can get shorter or longer depending on
529     // SplitIndex. Score.LocalScore is incremented in place.
530     computeLocalCallScore(BlockOrder, SplitIndex, Score);
531 
532     // Second part of LocalScore is the sum over jump edges with src basic block
533     // and dst basic block in the current function. Score.LocalScore is
534     // incremented in place.
535     computeJumpScore(BlockOrder, SplitIndex, Score);
536 
537     // Compute CoverCallScore and store in Score in place.
538     computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
539     return Score;
540   }
541 
542   /// Find the most likely successor of a basic block when it has one or two
543   /// successors. Return nullptr otherwise.
544   const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
545     if (BB->succ_size() == 1)
546       return BB->getSuccessor();
547     if (BB->succ_size() == 2) {
548       uint64_t TakenCount = BB->getTakenBranchInfo().Count;
549       assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
550       uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
551       assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
552       if (TakenCount > NonTakenCount)
553         return BB->getConditionalSuccessor(true);
554       else if (TakenCount < NonTakenCount)
555         return BB->getConditionalSuccessor(false);
556     }
557     return nullptr;
558   }
559 
560   /// Find the best index for splitting. The returned value is the index of the
561   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
562   /// value which is one less than the size of the function.
563   size_t findSplitIndex(const BinaryFunction &BF,
564                         const BasicBlockOrder &BlockOrder) {
565     assert(BlockOrder.size() > 2);
566     // Find all function calls that can be shortened if we move blocks of the
567     // current function to warm/cold
568     const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
569 
570     // Find the existing hot-cold splitting index.
571     size_t HotColdIndex = 0;
572     while (HotColdIndex + 1 < BlockOrder.size()) {
573       if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
574         break;
575       HotColdIndex++;
576     }
577     assert(HotColdIndex + 1 == BlockOrder.size() ||
578            (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
579             BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
580                 FragmentNum::cold()));
581 
582     // Try all possible split indices up to HotColdIndex (blocks that have
583     // Index <= SplitIndex are in hot) and find the one maximizing the
584     // splitting score.
585     SplitScore BestScore;
586     for (size_t Index = 0; Index <= HotColdIndex; Index++) {
587       const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
588       assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
589 
590       // Do not break jump to the most likely successor.
591       if (Index + 1 < BlockOrder.size() &&
592           BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
593         continue;
594 
595       const SplitScore Score =
596           computeSplitScore(BF, BlockOrder, Index, CoverCalls);
597       if (Score.sum() > BestScore.sum())
598         BestScore = Score;
599     }
600 
601     // If we don't find a good splitting point, fall back to the original one.
602     if (BestScore.SplitIndex == size_t(-1))
603       return HotColdIndex;
604 
605     return BestScore.SplitIndex;
606   }
607 };
608 
609 struct SplitRandom2 final : public SplitStrategy {
610   std::minstd_rand0 Gen;
611 
612   SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}
613 
614   bool canSplit(const BinaryFunction &BF) override { return true; }
615 
616   bool compactFragments() override { return true; }
617 
618   void fragment(const BlockIt Start, const BlockIt End) override {
619     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
620     const DiffT NumBlocks = End - Start;
621     assert(NumBlocks > 0 && "Cannot fragment empty function");
622 
623     // We want to split at least one block
624     const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
625     std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
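    // Note: for a single-block function SplitPoint == NumBlocks, so the range
    // below is empty and nothing gets outlined.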
626     const DiffT SplitPoint = Dist(Gen);
627     for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
628       BB->setFragmentNum(FragmentNum::cold());
629 
630     LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
631                                  "{1} possible) blocks to split\n",
632                                  NumBlocks - SplitPoint, End - Start));
633   }
634 };
635 
636 struct SplitRandomN final : public SplitStrategy {
637   std::minstd_rand0 Gen;
638 
639   SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}
640 
641   bool canSplit(const BinaryFunction &BF) override { return true; }
642 
643   bool compactFragments() override { return true; }
644 
645   void fragment(const BlockIt Start, const BlockIt End) override {
646     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
647     const DiffT NumBlocks = End - Start;
648     assert(NumBlocks > 0 && "Cannot fragment empty function");
649 
650     // With n blocks, there are n-1 places to split them.
651     const DiffT MaximumSplits = NumBlocks - 1;
652     // We want to generate at least two fragments if possible, but if there is
653     // only one block, no splits are possible.
654     const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
655     std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
656     // Choose how many splits to perform
657     const DiffT NumSplits = Dist(Gen);
658 
659     // Draw split points from a lottery
660     SmallVector<unsigned, 0> Lottery(MaximumSplits);
661     // Start the lottery at 1, because there is no meaningful split point
662     // before the first block.
663     std::iota(Lottery.begin(), Lottery.end(), 1u);
664     std::shuffle(Lottery.begin(), Lottery.end(), Gen);
665     Lottery.resize(NumSplits);
666     llvm::sort(Lottery);
667 
668     // Add a one-past-the-end entry to the lottery.
669     Lottery.push_back(NumBlocks);
670 
671     unsigned LotteryIndex = 0;
672     unsigned BBPos = 0;
673     for (BinaryBasicBlock *const BB : make_range(Start, End)) {
674       // Check whether to start new fragment
675       if (BBPos >= Lottery[LotteryIndex])
676         ++LotteryIndex;
677 
678       // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
679       // use the index to assign fragments.
680       BB->setFragmentNum(FragmentNum(LotteryIndex));
681 
682       ++BBPos;
683     }
684   }
685 };
686 
687 struct SplitAll final : public SplitStrategy {
688   bool canSplit(const BinaryFunction &BF) override { return true; }
689 
690   bool compactFragments() override {
691     // Keeping empty fragments allows us to test that empty fragments do not
692     // generate symbols.
693     return false;
694   }
695 
696   void fragment(const BlockIt Start, const BlockIt End) override {
697     unsigned Fragment = 0;
698     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
699       BB->setFragmentNum(FragmentNum(Fragment++));
700   }
701 };
702 } // namespace
703 
704 namespace llvm {
705 namespace bolt {
706 
707 bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
708   // Apply execution count threshold
709   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
710     return false;
711 
712   return BinaryFunctionPass::shouldOptimize(BF);
713 }
714 
715 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
716   if (!opts::SplitFunctions)
717     return;
718 
719   // If split strategy is not CDSplit, then a second run of the pass is not
720   // If the split strategy is not CDSplit, then a second run of the pass is
721   // not needed after function reordering.
722       opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
723     return;
724 
725   std::unique_ptr<SplitStrategy> Strategy;
726   bool ForceSequential = false;
727 
728   switch (opts::SplitStrategy) {
729   case SplitFunctionsStrategy::CDSplit:
730     // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
731     // before function reordering and hot-warm-cold splitting
732     // (SplitCacheDirected) after function reordering.
733     if (BC.HasFinalizedFunctionOrder)
734       Strategy = std::make_unique<SplitCacheDirected>(BC);
735     else
736       Strategy = std::make_unique<SplitProfile2>();
737     opts::AggressiveSplitting = true;
738     BC.HasWarmSection = true;
739     break;
740   case SplitFunctionsStrategy::Profile2:
741     Strategy = std::make_unique<SplitProfile2>();
742     break;
743   case SplitFunctionsStrategy::Random2:
744     Strategy = std::make_unique<SplitRandom2>();
745     // If we split functions randomly, we need to ensure that across runs with
746     // the same input, we generate random numbers for each function in the same
747     // order.
748     ForceSequential = true;
749     break;
750   case SplitFunctionsStrategy::RandomN:
751     Strategy = std::make_unique<SplitRandomN>();
752     ForceSequential = true;
753     break;
754   case SplitFunctionsStrategy::All:
755     Strategy = std::make_unique<SplitAll>();
756     break;
757   }
758 
759   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
760     return !shouldOptimize(BF);
761   };
762 
763   ParallelUtilities::runOnEachFunction(
764       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
765       [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
766       "SplitFunctions", ForceSequential);
767 
768   if (SplitBytesHot + SplitBytesCold > 0)
769     outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
770            << " hot bytes from " << SplitBytesCold << " cold bytes "
771            << format("(%.2lf%% of split functions is hot).\n",
772                      100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
773 }
774 
775 void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
776   if (BF.empty())
777     return;
778 
779   if (!S.canSplit(BF))
780     return;
781 
782   FunctionLayout &Layout = BF.getLayout();
783   BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
784                                                      Layout.block_end());
785 
786   BinaryContext &BC = BF.getBinaryContext();
787   size_t OriginalHotSize;
788   size_t HotSize;
789   size_t ColdSize;
790   if (BC.isX86()) {
791     std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
792     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
793                       << " pre-split is <0x"
794                       << Twine::utohexstr(OriginalHotSize) << ", 0x"
795                       << Twine::utohexstr(ColdSize) << ">\n");
796   }
797 
798   BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
799                                                 Layout.block_end());
800   // Never outline the first basic block.
801   NewLayout.front()->setCanOutline(false);
802   for (BinaryBasicBlock *const BB : NewLayout) {
803     if (!BB->canOutline())
804       continue;
805 
806     // Do not split extra entry points on AArch64. They can be referenced
807     // using ADR instructions, and when that happens these blocks cannot be
808     // placed far away due to the limited range of the ADR instruction.
809     if (BC.isAArch64() && BB->isEntryPoint()) {
810       BB->setCanOutline(false);
811       continue;
812     }
813 
814     if (BF.hasEHRanges() && !opts::SplitEH) {
815       // We cannot move landing pads (or rather entry points for landing pads).
816       if (BB->isLandingPad()) {
817         BB->setCanOutline(false);
818         continue;
819       }
820       // We cannot move a block that can throw since the exception-handling
821       // runtime cannot deal with split functions. However, if we can guarantee
822       // that the block never throws, it is safe to move the block to
823       // decrease the size of the function.
824       for (MCInst &Instr : *BB) {
825         if (BC.MIB->isInvoke(Instr)) {
826           BB->setCanOutline(false);
827           break;
828         }
829       }
830     }
831   }
832 
833   BF.getLayout().updateLayoutIndices();
834   S.fragment(NewLayout.begin(), NewLayout.end());
835 
836   // Make sure all non-outlineable blocks are in the main fragment.
837   for (BinaryBasicBlock *const BB : NewLayout) {
838     if (!BB->canOutline())
839       BB->setFragmentNum(FragmentNum::main());
840   }
841 
842   if (opts::AggressiveSplitting) {
843     // All blocks with 0 count that we can move go to the end of the function,
844     // even if they were natural to cluster formation and appeared in between
845     // hot basic blocks.
846     llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
847                                      const BinaryBasicBlock *const B) {
848       return A->getFragmentNum() < B->getFragmentNum();
849     });
850   } else if (BF.hasEHRanges() && !opts::SplitEH) {
851     // Typically functions with exception handling have landing pads at the end.
852     // We cannot move the beginning of a landing pad, but we can move 0-count
853     // blocks comprising landing pads to the end and thus facilitate splitting.
854     auto FirstLP = NewLayout.begin();
855     while ((*FirstLP)->isLandingPad())
856       ++FirstLP;
857 
858     std::stable_sort(FirstLP, NewLayout.end(),
859                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
860                        return A->getFragmentNum() < B->getFragmentNum();
861                      });
862   }
863 
864   // Make sure that fragments are increasing.
865   FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
866   for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
867     if (BB->getFragmentNum() > CurrentFragment)
868       BB->setFragmentNum(CurrentFragment);
869     CurrentFragment = BB->getFragmentNum();
870   }
871 
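  // If the strategy allows it, renumber fragments so that the assigned
  // fragment numbers are consecutive, with no gaps left by empty fragments.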
872   if (S.compactFragments()) {
873     FragmentNum CurrentFragment = FragmentNum::main();
874     FragmentNum NewFragment = FragmentNum::main();
875     for (BinaryBasicBlock *const BB : NewLayout) {
876       if (BB->getFragmentNum() > CurrentFragment) {
877         CurrentFragment = BB->getFragmentNum();
878         NewFragment = FragmentNum(NewFragment.get() + 1);
879       }
880       BB->setFragmentNum(NewFragment);
881     }
882   }
883 
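  // LayoutUpdated records whether updating the layout actually changed the
  // block order; the size comparison and branch fixing below only run if it did.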
884   const bool LayoutUpdated = BF.getLayout().update(NewLayout);
885 
886   // For shared objects, invoke instructions and corresponding landing pads
887   // have to be placed in the same fragment. When we split them, create
888   // trampoline landing pads that will redirect execution to the real LPs.
889   TrampolineSetType Trampolines;
890   if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
891     Trampolines = createEHTrampolines(BF);
892 
893   // Check the new size to see if it's worth splitting the function.
894   if (BC.isX86() && LayoutUpdated) {
895     std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
896     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
897                       << " post-split is <0x" << Twine::utohexstr(HotSize)
898                       << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
899     if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
900         alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
901       if (opts::Verbosity >= 2) {
902         outs() << "BOLT-INFO: Reversing splitting of function "
903                << formatv("{0}:\n  {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
904                           ColdSize, OriginalHotSize);
905       }
906 
907       // Reverse the action of createEHTrampolines(). The trampolines will be
908       // placed immediately before the matching destination, resulting in no
909       // extra code.
910       if (PreSplitLayout.size() != BF.size())
911         PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);
912 
913       for (BinaryBasicBlock &BB : BF)
914         BB.setFragmentNum(FragmentNum::main());
915       BF.getLayout().update(PreSplitLayout);
916     } else {
917       SplitBytesHot += HotSize;
918       SplitBytesCold += ColdSize;
919     }
920   }
921 
922   // Fix branches if the splitting decision of the pass after function
923   // reordering is different from that of the pass before function reordering.
924   if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
925     BF.fixBranches();
926 }
927 
928 SplitFunctions::TrampolineSetType
929 SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
930   const auto &MIB = BF.getBinaryContext().MIB;
931 
932   // Map real landing pads to the corresponding trampolines.
933   TrampolineSetType LPTrampolines;
934 
935   // Iterate over a copy of the basic blocks since we are adding new blocks to
936   // the function, which will invalidate its iterators.
937   std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
938   for (BinaryBasicBlock *BB : Blocks) {
939     for (MCInst &Instr : *BB) {
940       const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
941       if (!EHInfo || !EHInfo->first)
942         continue;
943 
944       const MCSymbol *LPLabel = EHInfo->first;
945       BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
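      // A trampoline is only needed when the throwing block and its landing
      // pad end up in different fragments.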
946       if (BB->getFragmentNum() == LPBlock->getFragmentNum())
947         continue;
948 
949       const MCSymbol *TrampolineLabel = nullptr;
950       const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
951       auto Iter = LPTrampolines.find(Key);
952       if (Iter != LPTrampolines.end()) {
953         TrampolineLabel = Iter->second;
954       } else {
955         // Create a trampoline basic block in the same fragment as the thrower.
956         // Note: there's no need to insert the jump instruction; it will be
957         // added by fixBranches().
958         BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
959         TrampolineBB->setFragmentNum(BB->getFragmentNum());
960         TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
961         TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
962         TrampolineBB->setCFIState(LPBlock->getCFIState());
963         TrampolineLabel = TrampolineBB->getLabel();
964         LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
965       }
966 
967       // Substitute the landing pad with the trampoline.
968       MIB->updateEHInfo(Instr,
969                         MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
970     }
971   }
972 
973   if (LPTrampolines.empty())
974     return LPTrampolines;
975 
976   // All trampoline blocks were added to the end of the function. Place them at
977   // the end of corresponding fragments.
978   BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
979                                                 BF.getLayout().block_end());
980   stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
981     return A->getFragmentNum() < B->getFragmentNum();
982   });
983   BF.getLayout().update(NewLayout);
984 
985   // Conservatively introduce branch instructions.
986   BF.fixBranches();
987 
988   // Update exception-handling CFG for the function.
989   BF.recomputeLandingPads();
990 
991   return LPTrampolines;
992 }
993 
994 SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
995     BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
996     const SplitFunctions::TrampolineSetType &Trampolines) const {
997   DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
998       IncomingTrampolines;
999   for (const auto &Entry : Trampolines) {
1000     IncomingTrampolines[Entry.getFirst().Target].emplace_back(
1001         Entry.getSecond());
1002   }
1003 
1004   BasicBlockOrderType MergedLayout;
1005   for (BinaryBasicBlock *BB : Layout) {
1006     auto Iter = IncomingTrampolines.find(BB->getLabel());
1007     if (Iter != IncomingTrampolines.end()) {
1008       for (const MCSymbol *const Trampoline : Iter->getSecond()) {
1009         BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
1010         assert(LPBlock && "Could not find matching landing pad block.");
1011         MergedLayout.push_back(LPBlock);
1012       }
1013     }
1014     MergedLayout.push_back(BB);
1015   }
1016 
1017   return MergedLayout;
1018 }
1019 
1020 } // namespace bolt
1021 } // namespace llvm
1022