1 //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SplitFunctions pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/SplitFunctions.h"
14 #include "bolt/Core/BinaryBasicBlock.h"
15 #include "bolt/Core/BinaryFunction.h"
16 #include "bolt/Core/FunctionLayout.h"
17 #include "bolt/Core/ParallelUtilities.h"
18 #include "bolt/Utils/CommandLineOpts.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/SmallVector.h"
21 #include "llvm/ADT/iterator_range.h"
22 #include "llvm/Support/CommandLine.h"
23 #include "llvm/Support/FormatVariadic.h"
24 #include <algorithm>
25 #include <iterator>
26 #include <memory>
27 #include <numeric>
28 #include <random>
29 #include <vector>
30 
31 #define DEBUG_TYPE "bolt-opts"
32 
33 using namespace llvm;
34 using namespace bolt;
35 
36 namespace {
37 class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
38 public:
39   explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
40       : cl::parser<bool>(O) {}
41 
42   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
43     if (Arg == "2" || Arg == "3") {
44       Value = true;
45       errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
46                         "for option -{1} is deprecated\n",
47                         Arg, ArgName);
48       return false;
49     }
50     return cl::parser<bool>::parse(O, ArgName, Arg, Value);
51   }
52 };
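// Illustrative note: with this parser, legacy invocations such as
// "-split-functions=2" or "-split-functions=3" are accepted as plain
// "-split-functions" (true) with a deprecation warning, while any other value
// (e.g. "-split-functions=0") goes through the regular boolean parser.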
53 } // namespace
54 
55 namespace opts {
56 
57 extern cl::OptionCategory BoltOptCategory;
58 
59 extern cl::opt<bool> SplitEH;
60 extern cl::opt<unsigned> ExecutionCountThreshold;
61 extern cl::opt<uint32_t> RandomSeed;
62 
63 static cl::opt<bool> AggressiveSplitting(
64     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
65     cl::cat(BoltOptCategory));
66 
67 static cl::opt<unsigned> SplitAlignThreshold(
68     "split-align-threshold",
69     cl::desc("when deciding to split a function, apply this alignment "
70              "while doing the size comparison (see -split-threshold). "
71              "Default value: 2."),
72     cl::init(2),
73 
74     cl::Hidden, cl::cat(BoltOptCategory));
75 
76 static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
77     SplitFunctions("split-functions",
78                    cl::desc("split functions into fragments"),
79                    cl::cat(BoltOptCategory));
80 
81 static cl::opt<unsigned> SplitThreshold(
82     "split-threshold",
83     cl::desc("split function only if its main size is reduced by more than "
84              "given amount of bytes. Default value: 0, i.e. split iff the "
85              "size is reduced. Note that on some architectures the size can "
86              "increase after splitting."),
87     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
88 
89 static cl::opt<SplitFunctionsStrategy> SplitStrategy(
90     "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
91     cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
92                           "split each function into a hot and cold fragment "
93                           "using profiling information")),
94     cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
95                           "split each function into a hot, warm, and cold "
96                           "fragment using profiling information")),
97     cl::values(clEnumValN(
98         SplitFunctionsStrategy::Random2, "random2",
99         "split each function into a hot and cold fragment at a randomly chosen "
100         "split point (ignoring any available profiling information)")),
101     cl::values(clEnumValN(
102         SplitFunctionsStrategy::RandomN, "randomN",
103         "split each function into N fragments at a randomly chosen split "
104         "points (ignoring any available profiling information)")),
105     cl::values(clEnumValN(
106         SplitFunctionsStrategy::All, "all",
107         "split all basic blocks of each function into fragments such that each "
108         "fragment contains exactly a single basic block")),
109     cl::desc("strategy used to partition blocks into fragments"),
110     cl::cat(BoltOptCategory));
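// A hedged usage sketch: the strategy is expected to be selected together with
// function splitting itself, e.g.
//   llvm-bolt <input-binary> -o <output-binary> --split-functions \
//     --split-strategy=cdsplit
// (the binary paths here are placeholders).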
111 
112 static cl::opt<double> CallScale(
113     "call-scale",
114     cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
115     cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
116 
117 static cl::opt<double>
118     CallPower("call-power",
119               cl::desc("Call score power (when --split-strategy=cdsplit)"),
120               cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
121 
122 static cl::opt<double>
123     JumpPower("jump-power",
124               cl::desc("Jump score power (when --split-strategy=cdsplit)"),
125               cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
126 } // namespace opts
127 
128 namespace {
129 bool hasFullProfile(const BinaryFunction &BF) {
130   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
131     return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
132   });
133 }
134 
135 bool allBlocksCold(const BinaryFunction &BF) {
136   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
137     return BB.getExecutionCount() == 0;
138   });
139 }
140 
141 struct SplitProfile2 final : public SplitStrategy {
142   bool canSplit(const BinaryFunction &BF) override {
143     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
144   }
145 
146   bool compactFragments() override { return true; }
147 
148   void fragment(const BlockIt Start, const BlockIt End) override {
149     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
150       if (BB->getExecutionCount() == 0)
151         BB->setFragmentNum(FragmentNum::cold());
152     }
153   }
154 };
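// Illustrative example for the profile2 strategy: for a layout whose block
// execution counts are {100, 0, 7, 0}, the two zero-count blocks are assigned
// to the cold fragment and the remaining blocks stay in the main fragment.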
155 
156 struct SplitCacheDirected final : public SplitStrategy {
157   BinaryContext &BC;
158   using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
159 
160   bool canSplit(const BinaryFunction &BF) override {
161     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
162   }
163 
164   explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
165     initializeAuxiliaryVariables();
166     buildCallGraph();
167   }
168 
169   // When some functions are hot-warm split and others are hot-warm-cold split,
170   // we do not want to change the fragment numbers of the blocks in the hot-warm
171   // split functions.
172   bool compactFragments() override { return false; }
173 
174   void fragment(const BlockIt Start, const BlockIt End) override {
175     BasicBlockOrder BlockOrder(Start, End);
176     BinaryFunction &BF = *BlockOrder.front()->getFunction();
177     // No need to re-split small functions.
178     if (BlockOrder.size() <= 2)
179       return;
180 
181     size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
182     assert(BestSplitIndex < BlockOrder.size());
183 
184     // Assign fragments based on the computed best split index.
185     // All basic blocks with index up to the best split index become hot.
186     // All remaining blocks are warm or cold depending on whether their
187     // count is greater than zero.
188     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
189       BinaryBasicBlock *BB = BlockOrder[Index];
190       if (Index <= BestSplitIndex)
191         BB->setFragmentNum(FragmentNum::main());
192       else
193         BB->setFragmentNum(BB->getKnownExecutionCount() > 0
194                                ? FragmentNum::warm()
195                                : FragmentNum::cold());
196     }
197   }
198 
199 private:
200   struct CallInfo {
201     size_t Length;
202     size_t Count;
203   };
204 
205   struct SplitScore {
206     size_t SplitIndex = size_t(-1);
207     size_t HotSizeReduction = 0;
208     double LocalScore = 0;
209     double CoverCallScore = 0;
210 
211     double sum() const { return LocalScore + CoverCallScore; }
212   };
213 
214   // Auxiliary variables used by the algorithm.
215   size_t TotalNumBlocks{0};
216   size_t OrigHotSectionSize{0};
217   DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
218   DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
219   DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
220 
221   // Call graph.
222   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
223   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;
224 
225   bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
226     // Only a subset of the functions in the binary will be considered when
227     // initializing auxiliary variables and building the call graph.
228     return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
229   }
230 
231   void initializeAuxiliaryVariables() {
232     for (BinaryFunction *BF : BC.getSortedFunctions()) {
233       if (!shouldConsiderForCallGraph(*BF))
234         continue;
235 
236       // Calculate the size of each BB after hot-cold splitting.
237       // This populates BinaryBasicBlock::OutputAddressRange which
238       // can be used to compute the size of each BB.
239       BC.calculateEmittedSize(*BF, /*FixBranches=*/true);
240 
241       for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
242         // Unique global index.
243         GlobalIndices[BB] = TotalNumBlocks;
244         TotalNumBlocks++;
245 
246         // Block size after hot-cold splitting.
247         BBSizes[BB] = BB->getOutputSize();
248 
249         // Hot block offset after hot-cold splitting.
250         BBOffsets[BB] = OrigHotSectionSize;
251         if (!BB->isSplit())
252           OrigHotSectionSize += BBSizes[BB];
253       }
254     }
255   }
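  // Worked example with hypothetical sizes: for a function F1 with hot blocks
  // A (10 bytes) and B (20 bytes) plus a cold block C (5 bytes), followed by a
  // function F2 with a single hot block D (8 bytes), this yields
  // BBOffsets[A] = 0, BBOffsets[B] = 10, BBOffsets[D] = 30, and
  // OrigHotSectionSize = 38; C still receives an offset but, being split, does
  // not contribute to the hot section size.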
256 
257   void buildCallGraph() {
258     Callers.resize(TotalNumBlocks);
259     Callees.resize(TotalNumBlocks);
260     for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
261       if (!shouldConsiderForCallGraph(*SrcFunction))
262         continue;
263 
264       for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
265         // Skip blocks that are not executed
266         if (SrcBB.getKnownExecutionCount() == 0)
267           continue;
268 
269         // Find call instructions and extract target symbols from each one
270         for (const MCInst &Inst : SrcBB) {
271           if (!BC.MIB->isCall(Inst))
272             continue;
273 
274           // Call info
275           const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
276           // Ignore calls w/o information
277           if (!DstSym)
278             continue;
279 
280           const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
281           // Ignore calls that do not have a valid target, but do not ignore
282           // recursive calls, because the caller block could be moved to warm.
283           if (!DstFunction || DstFunction->getLayout().block_empty())
284             continue;
285 
286           const BinaryBasicBlock *DstBB = &(DstFunction->front());
287 
288           // Record the call only if DstBB is also in a function considered
289           // for the call graph.
290           if (GlobalIndices.contains(DstBB)) {
291             Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
292             Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
293           }
294         }
295       }
296     }
297   }
298 
299   /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
300   /// start and end addresses for hot and warm basic blocks, assuming hot-warm
301   /// splitting happens at \p SplitIndex. Also return estimated end addresses
302   /// of the hot fragment before and after splitting.
303   /// The estimations take into account the potential addition of branch
304   /// instructions due to split fall through branches as well as the need to
305   /// use longer branch instructions for split (un)conditional branches.
306   std::pair<size_t, size_t>
307   estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
308                              const size_t SplitIndex) {
309     assert(SplitIndex < BlockOrder.size() && "Invalid split index");
310 
311     // Update function layout assuming hot-warm splitting at SplitIndex.
312     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
313       BinaryBasicBlock *BB = BlockOrder[Index];
314       if (BB->getFragmentNum() == FragmentNum::cold())
315         break;
316       BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
317                                              : FragmentNum::warm());
318     }
319     BinaryFunction *BF = BlockOrder[0]->getFunction();
320     BF->getLayout().update(BlockOrder);
321     // Populate BB.OutputAddressRange under the updated layout.
322     BC.calculateEmittedSize(*BF);
323 
324     // Populate BB.OutputAddressRange with estimated new start and end addresses
325     // and compute the old end address of the hot section and the new end
326     // address of the hot section.
327     size_t OldHotEndAddr{0};
328     size_t NewHotEndAddr{0};
329     size_t CurrentAddr = BBOffsets[BlockOrder[0]];
330     for (BinaryBasicBlock *BB : BlockOrder) {
331       // We only care about new addresses of blocks in hot/warm.
332       if (BB->getFragmentNum() == FragmentNum::cold())
333         break;
334       const size_t NewSize = BB->getOutputSize();
335       BB->setOutputStartAddress(CurrentAddr);
336       CurrentAddr += NewSize;
337       BB->setOutputEndAddress(CurrentAddr);
338       if (BB->getLayoutIndex() == SplitIndex) {
339         NewHotEndAddr = CurrentAddr;
340         // Approximate the start address of the warm fragment of the current
341         // function using the original hot section size.
342         CurrentAddr = OrigHotSectionSize;
343       }
344       OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
345     }
346     return std::make_pair(OldHotEndAddr, NewHotEndAddr);
347   }
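  // For instance (hypothetical numbers): if the hot portion of the function
  // originally ended at offset 0x800 and splitting at SplitIndex moves enough
  // trailing blocks to the warm fragment that it now ends at 0x700, the pair
  // {0x800, 0x700} is returned and the caller records a hot size reduction of
  // 0x100 bytes.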
348 
349   /// Get a collection of "shortenable" calls, that is, calls of type X->Y
350   /// when the function order is [... X ... BF ... Y ...].
351   /// If the hot fragment size of BF is reduced, then such calls are guaranteed
352   /// to get shorter by the reduced hot fragment size.
353   std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
354     // Record the length and the count of the calls that can be shortened
355     std::vector<CallInfo> CoverCalls;
356     if (opts::CallScale == 0)
357       return CoverCalls;
358 
359     const BinaryFunction *ThisBF = &BF;
360     const BinaryBasicBlock *ThisBB = &(ThisBF->front());
361     const size_t ThisGI = GlobalIndices[ThisBB];
362 
363     for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
364       if (!shouldConsiderForCallGraph(*DstBF))
365         continue;
366 
367       const BinaryBasicBlock *DstBB = &(DstBF->front());
368       if (DstBB->getKnownExecutionCount() == 0)
369         continue;
370 
371       const size_t DstGI = GlobalIndices[DstBB];
372       for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
373         const BinaryFunction *SrcBF = SrcBB->getFunction();
374         if (ThisBF == SrcBF)
375           continue;
376 
377         const size_t CallCount = SrcBB->getKnownExecutionCount();
378 
379         const size_t SrcGI = GlobalIndices[SrcBB];
380 
381         const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
382                                  (DstGI <= ThisGI && ThisGI < SrcGI);
383         if (!IsCoverCall)
384           continue;
385 
386         const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
387         const size_t DstBBStartAddr = BBOffsets[DstBB];
388         const size_t CallLength =
389             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
390         const CallInfo CI{CallLength, CallCount};
391         CoverCalls.emplace_back(CI);
392       }
393     }
394     return CoverCalls;
395   }
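  // Illustrative example: with a function order [X, BF, Y] where a hot block
  // of X calls the entry block of Y, that call "covers" BF. If splitting
  // shrinks BF's hot fragment by 100 bytes, the distance of that call shrinks
  // by the same 100 bytes, which is what computeCoverCallScore rewards.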
396 
397   /// Compute the edge score of a call edge.
398   double computeCallScore(uint64_t CallCount, size_t CallLength) {
399     // Increase call lengths by 1 to avoid raising 0 to a negative power.
400     return opts::CallScale * static_cast<double>(CallCount) /
401            std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
402   }
403 
404   /// Compute the edge score of a jump (branch) edge.
405   double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
406     // Increase jump lengths by 1 to avoid raising 0 to a negative power.
407     return static_cast<double>(JumpCount) /
408            std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
409   }
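  // Rough numeric sketch with the default knobs (CallScale = 0.95,
  // CallPower = 0.05, JumpPower = 0.15): a call edge with count 1000 and
  // length 4095 scores about 0.95 * 1000 / 4096^0.05, roughly 627, while a
  // jump edge with the same count and length scores about 1000 / 4096^0.15,
  // roughly 287. Shorter edges score higher, and calls are scaled by CallScale.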
410 
411   /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
412   /// Increment Score.LocalScore in place by the sum.
413   void computeJumpScore(const BasicBlockOrder &BlockOrder,
414                         const size_t SplitIndex, SplitScore &Score) {
415 
416     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
417       if (SrcBB->getKnownExecutionCount() == 0)
418         continue;
419 
420       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
421 
422       for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
423         const BinaryBasicBlock *DstBB = std::get<0>(Pair);
424         const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
425         const size_t JumpCount = Branch.Count;
426 
427         if (JumpCount == 0)
428           continue;
429 
430         const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
431         const size_t NewJumpLength =
432             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
433         Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
434       }
435     }
436   }
437 
438   /// Compute the sum of scores over calls originating in the current function
439   /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
440   void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
441                              const size_t SplitIndex, SplitScore &Score) {
442     if (opts::CallScale == 0)
443       return;
444 
445     // Global index of the last block in the current function.
446     // This is later used to determine whether a call originating in the current
447     // function targets a function that comes after the current one.
448     const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
449 
450     // The length of calls originating in the input function can increase or
451     // decrease depending on the splitting decision.
452     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
453       const size_t CallCount = SrcBB->getKnownExecutionCount();
454       // If SrcBB does not call any functions, skip it.
455       if (CallCount == 0)
456         continue;
457 
458       // Obtain an estimate on the end address of the src basic block
459       // after splitting at SplitIndex.
460       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
461 
462       for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
463         // Obtain an estimate on the start address of the dst basic block
464         // after splitting at SplitIndex. If DstBB is in a function before
465         // the current function, then its start address remains unchanged.
466         size_t DstBBStartAddr = BBOffsets[DstBB];
467         // If DstBB is in a function after the current function, then its
468         // start address should be adjusted based on the reduction in hot size.
469         if (GlobalIndices[DstBB] > LastGlobalIndex) {
470           assert(DstBBStartAddr >= Score.HotSizeReduction);
471           DstBBStartAddr -= Score.HotSizeReduction;
472         }
473         const size_t NewCallLength =
474             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
475         Score.LocalScore += computeCallScore(CallCount, NewCallLength);
476       }
477     }
478   }
479 
480   /// Compute sum of splitting scores for cover calls of the input function.
481   /// Increment Score.CoverCallScore in place by the sum.
482   void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
483                              const size_t SplitIndex,
484                              const std::vector<CallInfo> &CoverCalls,
485                              SplitScore &Score) {
486     if (opts::CallScale == 0)
487       return;
488 
489     for (const CallInfo CI : CoverCalls) {
490       assert(CI.Length >= Score.HotSizeReduction &&
491              "Length of cover calls must exceed reduced size of hot fragment.");
492       // Compute the new length of the call, which is shorter than the original
493       // one by the size of the split fragment minus the total size increase.
494       const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
495       Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
496     }
497   }
498 
499   /// Compute the split score of splitting a function at a given index.
500   /// The split score consists of local score and cover score. This function
501   /// returns \p Score of SplitScore type. It contains the local score and
502   /// cover score of the current splitting index. For easier bookkeeping and
503   /// comparison, it also stores the split index and the resulting reduction
504   /// in hot fragment size.
505   SplitScore computeSplitScore(const BinaryFunction &BF,
506                                const BasicBlockOrder &BlockOrder,
507                                const size_t SplitIndex,
508                                const std::vector<CallInfo> &CoverCalls) {
509     // Populate BinaryBasicBlock::OutputAddressRange with estimated
510     // new start and end addresses after hot-warm splitting at SplitIndex.
511     size_t OldHotEnd;
512     size_t NewHotEnd;
513     std::tie(OldHotEnd, NewHotEnd) =
514         estimatePostSplitBBAddress(BlockOrder, SplitIndex);
515 
516     SplitScore Score;
517     Score.SplitIndex = SplitIndex;
518 
519     // It's not worth splitting if OldHotEnd < NewHotEnd.
520     if (OldHotEnd < NewHotEnd)
521       return Score;
522 
523     // Hot fragment size reduction due to splitting.
524     Score.HotSizeReduction = OldHotEnd - NewHotEnd;
525 
526     // The first part of LocalScore is the sum over call edges originating in
527     // the input function. These edges can get shorter or longer depending on
528     // SplitIndex. Score.LocalScore is incremented in place.
529     computeLocalCallScore(BlockOrder, SplitIndex, Score);
530 
531     // The second part of LocalScore is the sum over jump edges whose source and
532     // destination basic blocks are both in the current function. Score.LocalScore
533     // is incremented in place.
534     computeJumpScore(BlockOrder, SplitIndex, Score);
535 
536     // Compute CoverCallScore and store in Score in place.
537     computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
538     return Score;
539   }
540 
541   /// Find the most likely successor of a basic block when it has one or two
542   /// successors. Return nullptr otherwise.
543   const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
544     if (BB->succ_size() == 1)
545       return BB->getSuccessor();
546     if (BB->succ_size() == 2) {
547       uint64_t TakenCount = BB->getTakenBranchInfo().Count;
548       assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
549       uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
550       assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
551       if (TakenCount > NonTakenCount)
552         return BB->getConditionalSuccessor(true);
553       else if (TakenCount < NonTakenCount)
554         return BB->getConditionalSuccessor(false);
555     }
556     return nullptr;
557   }
558 
559   /// Find the best index for splitting. The returned value is the index of the
560   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
561   /// value which is one less than the size of the function.
562   size_t findSplitIndex(const BinaryFunction &BF,
563                         const BasicBlockOrder &BlockOrder) {
564     assert(BlockOrder.size() > 2);
565     // Find all function calls that can be shortened if we move blocks of the
566     // current function to warm/cold
567     const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
568 
569     // Find the existing hot-cold splitting index.
570     size_t HotColdIndex = 0;
571     while (HotColdIndex + 1 < BlockOrder.size()) {
572       if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
573         break;
574       HotColdIndex++;
575     }
576     assert(HotColdIndex + 1 == BlockOrder.size() ||
577            (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
578             BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
579                 FragmentNum::cold()));
580 
581     // Try all possible split indices up to HotColdIndex (blocks that have
582     // Index <= SplitIndex are in hot) and find the one maximizing the
583     // splitting score.
584     SplitScore BestScore;
585     for (size_t Index = 0; Index <= HotColdIndex; Index++) {
586       const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
587       assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
588 
589       // Do not break jump to the most likely successor.
590       if (Index + 1 < BlockOrder.size() &&
591           BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
592         continue;
593 
594       const SplitScore Score =
595           computeSplitScore(BF, BlockOrder, Index, CoverCalls);
596       if (Score.sum() > BestScore.sum())
597         BestScore = Score;
598     }
599 
600     // If we don't find a good splitting point, fall back to the original one.
601     if (BestScore.SplitIndex == size_t(-1))
602       return HotColdIndex;
603 
604     return BestScore.SplitIndex;
605   }
606 };
607 
608 struct SplitRandom2 final : public SplitStrategy {
609   std::minstd_rand0 Gen;
610 
611   SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}
612 
613   bool canSplit(const BinaryFunction &BF) override { return true; }
614 
615   bool compactFragments() override { return true; }
616 
617   void fragment(const BlockIt Start, const BlockIt End) override {
618     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
619     const DiffT NumBlocks = End - Start;
620     assert(NumBlocks > 0 && "Cannot fragment empty function");
621 
622     // We want to split at least one block
623     const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
624     std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
625     const DiffT SplitPoint = Dist(Gen);
626     for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
627       BB->setFragmentNum(FragmentNum::cold());
628 
629     LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
630                                  "{1} possible) blocks to split\n",
631                                  NumBlocks - SplitPoint, End - Start));
632   }
633 };
634 
635 struct SplitRandomN final : public SplitStrategy {
636   std::minstd_rand0 Gen;
637 
638   SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}
639 
640   bool canSplit(const BinaryFunction &BF) override { return true; }
641 
642   bool compactFragments() override { return true; }
643 
644   void fragment(const BlockIt Start, const BlockIt End) override {
645     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
646     const DiffT NumBlocks = End - Start;
647     assert(NumBlocks > 0 && "Cannot fragment empty function");
648 
649     // With n blocks, there are n-1 places to split them.
650     const DiffT MaximumSplits = NumBlocks - 1;
651     // We want to generate at least two fragments if possible, but if there is
652     // only one block, no splits are possible.
653     const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
654     std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
655     // Choose how many splits to perform
656     const DiffT NumSplits = Dist(Gen);
657 
658     // Draw split points from a lottery
659     SmallVector<unsigned, 0> Lottery(MaximumSplits);
660     // Start the lottery at 1, because there is no meaningful split point
661     // before the first block.
662     std::iota(Lottery.begin(), Lottery.end(), 1u);
663     std::shuffle(Lottery.begin(), Lottery.end(), Gen);
664     Lottery.resize(NumSplits);
665     llvm::sort(Lottery);
666 
667     // Add a one-past-the-end entry to the lottery.
668     Lottery.push_back(NumBlocks);
669 
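    // Worked example: with NumBlocks = 5 and NumSplits = 2, a shuffled and
    // sorted lottery might be {2, 4} plus the sentinel 5, so blocks at
    // positions 0-1 land in fragment 0 (main), positions 2-3 in fragment 1,
    // and position 4 in fragment 2.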
670     unsigned LotteryIndex = 0;
671     unsigned BBPos = 0;
672     for (BinaryBasicBlock *const BB : make_range(Start, End)) {
673       // Check whether to start new fragment
674       if (BBPos >= Lottery[LotteryIndex])
675         ++LotteryIndex;
676 
677       // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
678       // use the index to assign fragments.
679       BB->setFragmentNum(FragmentNum(LotteryIndex));
680 
681       ++BBPos;
682     }
683   }
684 };
685 
686 struct SplitAll final : public SplitStrategy {
687   bool canSplit(const BinaryFunction &BF) override { return true; }
688 
689   bool compactFragments() override {
690     // Keeping empty fragments allows us to test that empty fragments do not
691     // generate symbols.
692     return false;
693   }
694 
695   void fragment(const BlockIt Start, const BlockIt End) override {
696     unsigned Fragment = 0;
697     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
698       BB->setFragmentNum(FragmentNum(Fragment++));
699   }
700 };
701 } // namespace
702 
703 namespace llvm {
704 namespace bolt {
705 
706 bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
707   // Apply execution count threshold
708   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
709     return false;
710 
711   return BinaryFunctionPass::shouldOptimize(BF);
712 }
713 
714 Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
715   if (!opts::SplitFunctions)
716     return Error::success();
717 
718   if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
719     BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
720                  "Linux kernel binary\n";
721     exit(1);
722   }
723 
724   // If split strategy is not CDSplit, then a second run of the pass is not
725   // needed after function reordering.
726   if (BC.HasFinalizedFunctionOrder &&
727       opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
728     return Error::success();
729 
730   std::unique_ptr<SplitStrategy> Strategy;
731   bool ForceSequential = false;
732 
733   switch (opts::SplitStrategy) {
734   case SplitFunctionsStrategy::CDSplit:
735     // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
736     // before function reordering and hot-warm-cold splitting
737     // (SplitCacheDirected) after function reordering.
738     if (BC.HasFinalizedFunctionOrder)
739       Strategy = std::make_unique<SplitCacheDirected>(BC);
740     else
741       Strategy = std::make_unique<SplitProfile2>();
742     opts::AggressiveSplitting = true;
743     BC.HasWarmSection = true;
744     break;
745   case SplitFunctionsStrategy::Profile2:
746     Strategy = std::make_unique<SplitProfile2>();
747     break;
748   case SplitFunctionsStrategy::Random2:
749     Strategy = std::make_unique<SplitRandom2>();
750     // If we split functions randomly, we need to ensure that across runs with
751     // the same input, we generate random numbers for each function in the same
752     // order.
753     ForceSequential = true;
754     break;
755   case SplitFunctionsStrategy::RandomN:
756     Strategy = std::make_unique<SplitRandomN>();
757     ForceSequential = true;
758     break;
759   case SplitFunctionsStrategy::All:
760     Strategy = std::make_unique<SplitAll>();
761     break;
762   }
763 
764   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
765     return !shouldOptimize(BF);
766   };
767 
768   ParallelUtilities::runOnEachFunction(
769       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
770       [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
771       "SplitFunctions", ForceSequential);
772 
773   if (SplitBytesHot + SplitBytesCold > 0)
774     BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
775               << " hot bytes from " << SplitBytesCold << " cold bytes "
776               << format("(%.2lf%% of split functions is hot).\n",
777                         100.0 * SplitBytesHot /
778                             (SplitBytesHot + SplitBytesCold));
779   return Error::success();
780 }
781 
782 void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
783   if (BF.empty())
784     return;
785 
786   if (!S.canSplit(BF))
787     return;
788 
789   FunctionLayout &Layout = BF.getLayout();
790   BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
791                                                      Layout.block_end());
792 
793   BinaryContext &BC = BF.getBinaryContext();
794   size_t OriginalHotSize;
795   size_t HotSize;
796   size_t ColdSize;
797   if (BC.isX86()) {
798     std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
799     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
800                       << " pre-split is <0x"
801                       << Twine::utohexstr(OriginalHotSize) << ", 0x"
802                       << Twine::utohexstr(ColdSize) << ">\n");
803   }
804 
805   BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
806                                                 Layout.block_end());
807   // Never outline the first basic block.
808   NewLayout.front()->setCanOutline(false);
809   for (BinaryBasicBlock *const BB : NewLayout) {
810     if (!BB->canOutline())
811       continue;
812 
813     // Do not split extra entry points on AArch64. They can be referenced via
814     // ADR instructions, and when that happens these blocks cannot be placed
815     // far away due to the limited range of the ADR instruction.
816     if (BC.isAArch64() && BB->isEntryPoint()) {
817       BB->setCanOutline(false);
818       continue;
819     }
820 
821     if (BF.hasEHRanges() && !opts::SplitEH) {
822       // We cannot move landing pads (or rather entry points for landing pads).
823       if (BB->isLandingPad()) {
824         BB->setCanOutline(false);
825         continue;
826       }
827       // We cannot move a block that can throw since the exception-handling
828       // runtime cannot deal with split functions. However, if we can guarantee
829       // that the block never throws, it is safe to move the block to
830       // decrease the size of the function.
831       for (MCInst &Instr : *BB) {
832         if (BC.MIB->isInvoke(Instr)) {
833           BB->setCanOutline(false);
834           break;
835         }
836       }
837     }
838 
839     // Outlining blocks with dynamic branches is not supported yet.
840     if (BC.IsLinuxKernel) {
841       if (llvm::any_of(
842               *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
843         BB->setCanOutline(false);
844     }
845   }
846 
847   BF.getLayout().updateLayoutIndices();
848   S.fragment(NewLayout.begin(), NewLayout.end());
849 
850   // Make sure all non-outlineable blocks are in the main fragment.
851   for (BinaryBasicBlock *const BB : NewLayout) {
852     if (!BB->canOutline())
853       BB->setFragmentNum(FragmentNum::main());
854   }
855 
856   if (opts::AggressiveSplitting) {
857     // All blocks with 0 count that we can move go to the end of the function,
858     // even if they were natural to cluster formation and were seen in between
859     // hot basic blocks.
860     llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
861                                      const BinaryBasicBlock *const B) {
862       return A->getFragmentNum() < B->getFragmentNum();
863     });
864   } else if (BF.hasEHRanges() && !opts::SplitEH) {
865     // Typically functions with exception handling have landing pads at the end.
866     // We cannot move the beginnings of landing pads, but we can move 0-count
867     // blocks comprising landing pads to the end and thus facilitate splitting.
868     auto FirstLP = NewLayout.begin();
869     while ((*FirstLP)->isLandingPad())
870       ++FirstLP;
871 
872     std::stable_sort(FirstLP, NewLayout.end(),
873                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
874                        return A->getFragmentNum() < B->getFragmentNum();
875                      });
876   }
877 
878   // Make sure that fragments are increasing.
879   FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
880   for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
881     if (BB->getFragmentNum() > CurrentFragment)
882       BB->setFragmentNum(CurrentFragment);
883     CurrentFragment = BB->getFragmentNum();
884   }
885 
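  // Illustrative example of compaction: if the fragment numbers along the
  // layout are {0, 2, 2, 5}, compaction renumbers them to {0, 1, 1, 2} so that
  // no empty fragments are left between used ones.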
886   if (S.compactFragments()) {
887     FragmentNum CurrentFragment = FragmentNum::main();
888     FragmentNum NewFragment = FragmentNum::main();
889     for (BinaryBasicBlock *const BB : NewLayout) {
890       if (BB->getFragmentNum() > CurrentFragment) {
891         CurrentFragment = BB->getFragmentNum();
892         NewFragment = FragmentNum(NewFragment.get() + 1);
893       }
894       BB->setFragmentNum(NewFragment);
895     }
896   }
897 
898   const bool LayoutUpdated = BF.getLayout().update(NewLayout);
899 
900   // For shared objects, invoke instructions and corresponding landing pads
901   // have to be placed in the same fragment. When we split them, create
902   // trampoline landing pads that will redirect the execution to real LPs.
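  // For example (illustrative): if a thrower block lives in fragment 1 while
  // its landing pad lives in fragment 2, a trampoline block is created in
  // fragment 1 whose sole job is to jump to the real landing pad, and the
  // thrower's EH info is rewired to point at the trampoline.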
903   TrampolineSetType Trampolines;
904   if (BF.hasEHRanges() && BF.isSplit()) {
905     // If all landing pads for this fragment are grouped in one (potentially
906     // different) fragment, we can set LPStart to the start of that fragment
907     // and avoid trampoline code.
908     bool NeedsTrampolines = false;
909     for (FunctionFragment &FF : BF.getLayout().fragments()) {
910       // Vector of fragments that contain landing pads for this fragment.
911       SmallVector<FragmentNum, 4> LandingPadFragments;
912       for (const BinaryBasicBlock *BB : FF)
913         for (const BinaryBasicBlock *LPB : BB->landing_pads())
914           LandingPadFragments.push_back(LPB->getFragmentNum());
915 
916       // Eliminate duplicate entries from the vector.
917       llvm::sort(LandingPadFragments);
918       auto Last = llvm::unique(LandingPadFragments);
919       LandingPadFragments.erase(Last, LandingPadFragments.end());
920 
921       if (LandingPadFragments.size() == 0) {
922         // If the fragment has no landing pads, we can safely set it as its own
923         // landing pad fragment.
924         BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
925       } else if (LandingPadFragments.size() == 1) {
926         BF.setLPFragment(FF.getFragmentNum(), LandingPadFragments.front());
927       } else {
928         if (!BC.HasFixedLoadAddress) {
929           NeedsTrampolines = true;
930           break;
931         } else {
932           BF.setLPFragment(FF.getFragmentNum(), std::nullopt);
933         }
934       }
935     }
936 
937     // Trampolines guarantee that all landing pads for any given fragment will
938     // be contained in the same fragment.
939     if (NeedsTrampolines) {
940       for (FunctionFragment &FF : BF.getLayout().fragments())
941         BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
942       Trampolines = createEHTrampolines(BF);
943     }
944   }
945 
946   // Check the new size to see if it's worth splitting the function.
947   if (BC.isX86() && LayoutUpdated) {
948     std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
949     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
950                       << " post-split is <0x" << Twine::utohexstr(HotSize)
951                       << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
952     if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
953         alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
954       if (opts::Verbosity >= 2) {
955         BC.outs() << "BOLT-INFO: Reversing splitting of function "
956                   << formatv("{0}:\n  {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
957                              ColdSize, OriginalHotSize);
958       }
959 
960       // Reverse the action of createEHTrampolines(). The trampolines will be
961       // placed immediately before the matching destination resulting in no
962       // extra code.
963       if (PreSplitLayout.size() != BF.size())
964         PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);
965 
966       for (BinaryBasicBlock &BB : BF)
967         BB.setFragmentNum(FragmentNum::main());
968       BF.getLayout().update(PreSplitLayout);
969     } else {
970       SplitBytesHot += HotSize;
971       SplitBytesCold += ColdSize;
972     }
973   }
974 
975   // Restore LP fragment for the main fragment if the splitting was undone.
976   if (BF.hasEHRanges() && !BF.isSplit())
977     BF.setLPFragment(FragmentNum::main(), FragmentNum::main());
978 
979   // Fix branches if the splitting decision of the pass after function
980   // reordering is different from that of the pass before function reordering.
981   if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
982     BF.fixBranches();
983 }
984 
985 SplitFunctions::TrampolineSetType
986 SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
987   const auto &MIB = BF.getBinaryContext().MIB;
988 
989   // Map real landing pads to the corresponding trampolines.
990   TrampolineSetType LPTrampolines;
991 
992   // Iterate over a copy of the basic blocks since we are adding new blocks to
993   // the function, which would invalidate its iterators.
994   std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
995   for (BinaryBasicBlock *BB : Blocks) {
996     for (MCInst &Instr : *BB) {
997       const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
998       if (!EHInfo || !EHInfo->first)
999         continue;
1000 
1001       const MCSymbol *LPLabel = EHInfo->first;
1002       BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
1003       if (BB->getFragmentNum() == LPBlock->getFragmentNum())
1004         continue;
1005 
1006       const MCSymbol *TrampolineLabel = nullptr;
1007       const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
1008       auto Iter = LPTrampolines.find(Key);
1009       if (Iter != LPTrampolines.end()) {
1010         TrampolineLabel = Iter->second;
1011       } else {
1012         // Create a trampoline basic block in the same fragment as the thrower.
1013         // Note: there's no need to insert the jump instruction, it will be
1014         // added by fixBranches().
1015         BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
1016         TrampolineBB->setFragmentNum(BB->getFragmentNum());
1017         TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
1018         TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
1019         TrampolineBB->setCFIState(LPBlock->getCFIState());
1020         TrampolineLabel = TrampolineBB->getLabel();
1021         LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
1022       }
1023 
1024       // Substitute the landing pad with the trampoline.
1025       MIB->updateEHInfo(Instr,
1026                         MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
1027     }
1028   }
1029 
1030   if (LPTrampolines.empty())
1031     return LPTrampolines;
1032 
1033   // All trampoline blocks were added to the end of the function. Place them at
1034   // the end of corresponding fragments.
1035   BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
1036                                                 BF.getLayout().block_end());
1037   stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
1038     return A->getFragmentNum() < B->getFragmentNum();
1039   });
1040   BF.getLayout().update(NewLayout);
1041 
1042   // Conservatively introduce branch instructions.
1043   BF.fixBranches();
1044 
1045   // Update exception-handling CFG for the function.
1046   BF.recomputeLandingPads();
1047 
1048   return LPTrampolines;
1049 }
1050 
1051 SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
1052     BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
1053     const SplitFunctions::TrampolineSetType &Trampolines) const {
1054   DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
1055       IncomingTrampolines;
1056   for (const auto &Entry : Trampolines) {
1057     IncomingTrampolines[Entry.getFirst().Target].emplace_back(
1058         Entry.getSecond());
1059   }
1060 
1061   BasicBlockOrderType MergedLayout;
1062   for (BinaryBasicBlock *BB : Layout) {
1063     auto Iter = IncomingTrampolines.find(BB->getLabel());
1064     if (Iter != IncomingTrampolines.end()) {
1065       for (const MCSymbol *const Trampoline : Iter->getSecond()) {
1066         BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
1067         assert(LPBlock && "Could not find matching landing pad block.");
1068         MergedLayout.push_back(LPBlock);
1069       }
1070     }
1071     MergedLayout.push_back(BB);
1072   }
1073 
1074   return MergedLayout;
1075 }
1076 
1077 } // namespace bolt
1078 } // namespace llvm
1079