xref: /llvm-project/bolt/lib/Passes/SplitFunctions.cpp (revision 296088bdf371e8ecd4a1797521b90881d6026581)
1 //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SplitFunctions pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/SplitFunctions.h"
14 #include "bolt/Core/BinaryBasicBlock.h"
15 #include "bolt/Core/BinaryFunction.h"
16 #include "bolt/Core/FunctionLayout.h"
17 #include "bolt/Core/ParallelUtilities.h"
18 #include "bolt/Utils/CommandLineOpts.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/Sequence.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Support/CommandLine.h"
24 #include "llvm/Support/FormatVariadic.h"
25 #include <algorithm>
26 #include <iterator>
27 #include <memory>
28 #include <numeric>
29 #include <random>
30 #include <vector>
31 
32 #define DEBUG_TYPE "bolt-opts"
33 
34 using namespace llvm;
35 using namespace bolt;
36 
37 namespace {
38 class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
39 public:
40   explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
41       : cl::parser<bool>(O) {}
42 
43   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
44     if (Arg == "2" || Arg == "3") {
45       Value = true;
46       errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
47                         "for option -{1} is deprecated\n",
48                         Arg, ArgName);
49       return false;
50     }
51     return cl::parser<bool>::parse(O, ArgName, Arg, Value);
52   }
53 };
54 } // namespace
55 
56 namespace opts {
57 
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bool> SplitEH;
61 extern cl::opt<unsigned> ExecutionCountThreshold;
62 extern cl::opt<uint32_t> RandomSeed;
63 
64 static cl::opt<bool> AggressiveSplitting(
65     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
66     cl::cat(BoltOptCategory));
67 
68 static cl::opt<unsigned> SplitAlignThreshold(
69     "split-align-threshold",
70     cl::desc("when deciding to split a function, apply this alignment "
71              "while doing the size comparison (see -split-threshold). "
72              "Default value: 2."),
73     cl::init(2),
74 
75     cl::Hidden, cl::cat(BoltOptCategory));
76 
77 static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
78     SplitFunctions("split-functions",
79                    cl::desc("split functions into fragments"),
80                    cl::cat(BoltOptCategory));
81 
82 static cl::opt<unsigned> SplitThreshold(
83     "split-threshold",
84     cl::desc("split function only if its main size is reduced by more than "
85              "given amount of bytes. Default value: 0, i.e. split iff the "
86              "size is reduced. Note that on some architectures the size can "
87              "increase after splitting."),
88     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
89 
90 static cl::opt<SplitFunctionsStrategy> SplitStrategy(
91     "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
92     cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
93                           "split each function into a hot and cold fragment "
94                           "using profiling information")),
95     cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
96                           "split each function into a hot, warm, and cold "
97                           "fragment using profiling information")),
98     cl::values(clEnumValN(
99         SplitFunctionsStrategy::Random2, "random2",
100         "split each function into a hot and cold fragment at a randomly chosen "
101         "split point (ignoring any available profiling information)")),
102     cl::values(clEnumValN(
103         SplitFunctionsStrategy::RandomN, "randomN",
104         "split each function into N fragments at a randomly chosen split "
105         "points (ignoring any available profiling information)")),
106     cl::values(clEnumValN(
107         SplitFunctionsStrategy::All, "all",
108         "split all basic blocks of each function into fragments such that each "
109         "fragment contains exactly a single basic block")),
110     cl::desc("strategy used to partition blocks into fragments"),
111     cl::cat(BoltOptCategory));
112 
113 static cl::opt<double> CallScale(
114     "call-scale",
115     cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
116     cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
117 
118 static cl::opt<double>
119     CallPower("call-power",
120               cl::desc("Call score power (when --split-strategy=cdsplit)"),
121               cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
122 
123 static cl::opt<double>
124     JumpPower("jump-power",
125               cl::desc("Jump score power (when --split-strategy=cdsplit)"),
126               cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
127 } // namespace opts
128 
129 namespace {
130 bool hasFullProfile(const BinaryFunction &BF) {
131   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
132     return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
133   });
134 }
135 
136 bool allBlocksCold(const BinaryFunction &BF) {
137   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
138     return BB.getExecutionCount() == 0;
139   });
140 }
141 
142 struct SplitProfile2 final : public SplitStrategy {
143   bool canSplit(const BinaryFunction &BF) override {
144     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
145   }
146 
147   bool compactFragments() override { return true; }
148 
149   void fragment(const BlockIt Start, const BlockIt End) override {
150     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
151       if (BB->getExecutionCount() == 0)
152         BB->setFragmentNum(FragmentNum::cold());
153     }
154   }
155 };
156 
157 struct SplitCacheDirected final : public SplitStrategy {
158   BinaryContext &BC;
159   using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
160 
161   bool canSplit(const BinaryFunction &BF) override {
162     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
163   }
164 
165   explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
166     initializeAuxiliaryVariables();
167     buildCallGraph();
168   }
169 
170   // When some functions are hot-warm split and others are hot-warm-cold split,
171   // we do not want to change the fragment numbers of the blocks in the hot-warm
172   // split functions.
173   bool compactFragments() override { return false; }
174 
175   void fragment(const BlockIt Start, const BlockIt End) override {
176     BasicBlockOrder BlockOrder(Start, End);
177     BinaryFunction &BF = *BlockOrder.front()->getFunction();
178 
179     size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
180 
181     // Assign fragments based on the computed best split index.
182     // All basic blocks with index up to the best split index become hot.
183     // All remaining blocks are warm / cold depending on if count is
184     // greater than zero or not.
185     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
186       BinaryBasicBlock *BB = BlockOrder[Index];
187       if (Index <= BestSplitIndex)
188         BB->setFragmentNum(FragmentNum::main());
189       else
190         BB->setFragmentNum(BB->getKnownExecutionCount() > 0
191                                ? FragmentNum::warm()
192                                : FragmentNum::cold());
193     }
194   }
195 
196 private:
197   struct CallInfo {
198     size_t Length;
199     size_t Count;
200   };
201 
202   struct SplitScore {
203     size_t SplitIndex;
204     size_t HotSizeReduction = 0;
205     double LocalScore = 0;
206     double CoverCallScore = 0;
207   };
208 
209   // Auxiliary variables used by the algorithm.
210   size_t TotalNumBlocks{0};
211   size_t OrigHotSectionSize{0};
212   DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
213   DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
214   DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
215 
216   // Call graph.
217   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
218   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;
219 
220   bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
221     // Only a subset of the functions in the binary will be considered
222     // for initializing auxiliary variables and building call graph.
223     return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
224   }
225 
226   void initializeAuxiliaryVariables() {
227     for (BinaryFunction *BF : BC.getSortedFunctions()) {
228       if (!shouldConsiderForCallGraph(*BF))
229         continue;
230 
231       // Calculate the size of each BB after hot-cold splitting.
232       // This populates BinaryBasicBlock::OutputAddressRange which
233       // can be used to compute the size of each BB.
234       BC.calculateEmittedSize(*BF, /*FixBranches=*/true);
235 
236       for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
237         // Unique global index.
238         GlobalIndices[BB] = TotalNumBlocks;
239         TotalNumBlocks++;
240 
241         // Block size after hot-cold splitting.
242         BBSizes[BB] = BB->getOutputSize();
243 
244         // Hot block offset after hot-cold splitting.
245         BBOffsets[BB] = OrigHotSectionSize;
246         if (!BB->isSplit())
247           OrigHotSectionSize += BBSizes[BB];
248       }
249     }
250   }
251 
252   void buildCallGraph() {
253     Callers.resize(TotalNumBlocks);
254     Callees.resize(TotalNumBlocks);
255     for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
256       if (!shouldConsiderForCallGraph(*SrcFunction))
257         continue;
258 
259       for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
260         // Skip blocks that are not executed
261         if (SrcBB.getKnownExecutionCount() == 0)
262           continue;
263 
264         // Find call instructions and extract target symbols from each one
265         for (const MCInst &Inst : SrcBB) {
266           if (!BC.MIB->isCall(Inst))
267             continue;
268 
269           // Call info
270           const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
271           // Ignore calls w/o information
272           if (!DstSym)
273             continue;
274 
275           const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
276           // Ignore calls that do not have a valid target, but do not ignore
277           // recursive calls, because caller block could be moved to warm.
278           if (!DstFunction || DstFunction->getLayout().block_empty())
279             continue;
280 
281           const BinaryBasicBlock *DstBB = &(DstFunction->front());
282 
283           // Record the call only if DstBB is also in functions to consider for
284           // call graph.
285           if (GlobalIndices.contains(DstBB)) {
286             Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
287             Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
288           }
289         }
290       }
291     }
292   }
293 
294   /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
295   /// start and end addresses for hot and warm basic blocks, assuming hot-warm
296   /// splitting happens at \p SplitIndex. Also return estimated end addresses
297   /// of the hot fragment before and after splitting.
298   /// The estimations take into account the potential addition of branch
299   /// instructions due to split fall through branches as well as the need to
300   /// use longer branch instructions for split (un)conditional branches.
301   std::pair<size_t, size_t>
302   estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
303                              const size_t SplitIndex) {
304     assert(SplitIndex < BlockOrder.size() && "Invalid split index");
305 
306     // Update function layout assuming hot-warm splitting at SplitIndex
307     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
308       BinaryBasicBlock *BB = BlockOrder[Index];
309       if (BB->getFragmentNum() == FragmentNum::cold())
310         break;
311       BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
312                                              : FragmentNum::warm());
313     }
314     BinaryFunction *BF = BlockOrder[0]->getFunction();
315     BF->getLayout().update(BlockOrder);
316     // Populate BB.OutputAddressRange under the updated layout.
317     BC.calculateEmittedSize(*BF);
318 
319     // Populate BB.OutputAddressRange with estimated new start and end addresses
320     // and compute the old end address of the hot section and the new end
321     // address of the hot section.
322     size_t OldHotEndAddr;
323     size_t NewHotEndAddr;
324     size_t CurrentAddr = BBOffsets[BlockOrder[0]];
325     for (BinaryBasicBlock *BB : BlockOrder) {
326       // We only care about new addresses of blocks in hot/warm.
327       if (BB->getFragmentNum() == FragmentNum::cold())
328         break;
329       const size_t NewSize = BB->getOutputSize();
330       BB->setOutputStartAddress(CurrentAddr);
331       CurrentAddr += NewSize;
332       BB->setOutputEndAddress(CurrentAddr);
333       if (BB->getLayoutIndex() == SplitIndex) {
334         NewHotEndAddr = CurrentAddr;
335         // Approximate the start address of the warm fragment of the current
336         // function using the original hot section size.
337         CurrentAddr = OrigHotSectionSize;
338       }
339       OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
340     }
341     return std::make_pair(OldHotEndAddr, NewHotEndAddr);
342   }
343 
344   /// Get a collection of "shortenable" calls, that is, calls of type X->Y
345   /// when the function order is [... X ... BF ... Y ...].
346   /// If the hot fragment size of BF is reduced, then such calls are guaranteed
347   /// to get shorter by the reduced hot fragment size.
348   std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
349     // Record the length and the count of the calls that can be shortened
350     std::vector<CallInfo> CoverCalls;
351     if (opts::CallScale == 0)
352       return CoverCalls;
353 
354     const BinaryFunction *ThisBF = &BF;
355     const BinaryBasicBlock *ThisBB = &(ThisBF->front());
356     const size_t ThisGI = GlobalIndices[ThisBB];
357 
358     for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
359       if (!shouldConsiderForCallGraph(*DstBF))
360         continue;
361 
362       const BinaryBasicBlock *DstBB = &(DstBF->front());
363       if (DstBB->getKnownExecutionCount() == 0)
364         continue;
365 
366       const size_t DstGI = GlobalIndices[DstBB];
367       for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
368         const BinaryFunction *SrcBF = SrcBB->getFunction();
369         if (ThisBF == SrcBF)
370           continue;
371 
372         const size_t CallCount = SrcBB->getKnownExecutionCount();
373 
374         const size_t SrcGI = GlobalIndices[SrcBB];
375 
376         const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
377                                  (DstGI <= ThisGI && ThisGI < SrcGI);
378         if (!IsCoverCall)
379           continue;
380 
381         const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
382         const size_t DstBBStartAddr = BBOffsets[DstBB];
383         const size_t CallLength =
384             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
385         const CallInfo CI{CallLength, CallCount};
386         CoverCalls.emplace_back(CI);
387       }
388     }
389     return CoverCalls;
390   }
391 
392   /// Compute the edge score of a call edge.
393   double computeCallScore(uint64_t CallCount, size_t CallLength) {
394     // Increase call lengths by 1 to avoid raising 0 to a negative power.
395     return opts::CallScale * static_cast<double>(CallCount) /
396            std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
397   }
398 
399   /// Compute the edge score of a jump (branch) edge.
400   double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
401     // Increase jump lengths by 1 to avoid raising 0 to a negative power.
402     return static_cast<double>(JumpCount) /
403            std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
404   }
405 
406   /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
407   /// Increament Score.LocalScore in place by the sum.
408   void computeJumpScore(const BasicBlockOrder &BlockOrder,
409                         const size_t SplitIndex, SplitScore &Score) {
410 
411     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
412       if (SrcBB->getKnownExecutionCount() == 0)
413         continue;
414 
415       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
416 
417       for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
418         const BinaryBasicBlock *DstBB = std::get<0>(Pair);
419         const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
420         const size_t JumpCount = Branch.Count;
421 
422         if (JumpCount == 0)
423           continue;
424 
425         const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
426         const size_t NewJumpLength =
427             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
428         Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
429       }
430     }
431   }
432 
433   /// Compute sum of scores over calls originated in the current function
434   /// given \p SplitIndex. Increament Score.LocalScore in place by the sum.
435   void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
436                              const size_t SplitIndex, SplitScore &Score) {
437     if (opts::CallScale == 0)
438       return;
439 
440     // Global index of the last block in the current function.
441     // This is later used to determine whether a call originated in the current
442     // function is to a function that comes after the current function.
443     const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
444 
445     // The length of calls originated in the input function can increase /
446     // decrease depending on the splitting decision.
447     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
448       const size_t CallCount = SrcBB->getKnownExecutionCount();
449       // If SrcBB does not call any functions, skip it.
450       if (CallCount == 0)
451         continue;
452 
453       // Obtain an estimate on the end address of the src basic block
454       // after splitting at SplitIndex.
455       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
456 
457       for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
458         // Obtain an estimate on the start address of the dst basic block
459         // after splitting at SplitIndex. If DstBB is in a function before
460         // the current function, then its start address remains unchanged.
461         size_t DstBBStartAddr = BBOffsets[DstBB];
462         // If DstBB is in a function after the current function, then its
463         // start address should be adjusted based on the reduction in hot size.
464         if (GlobalIndices[DstBB] > LastGlobalIndex) {
465           assert(DstBBStartAddr >= Score.HotSizeReduction);
466           DstBBStartAddr -= Score.HotSizeReduction;
467         }
468         const size_t NewCallLength =
469             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
470         Score.LocalScore += computeCallScore(CallCount, NewCallLength);
471       }
472     }
473   }
474 
475   /// Compute sum of splitting scores for cover calls of the input function.
476   /// Increament Score.CoverCallScore in place by the sum.
477   void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
478                              const size_t SplitIndex,
479                              const std::vector<CallInfo> &CoverCalls,
480                              SplitScore &Score) {
481     if (opts::CallScale == 0)
482       return;
483 
484     for (const CallInfo CI : CoverCalls) {
485       assert(CI.Length >= Score.HotSizeReduction &&
486              "Length of cover calls must exceed reduced size of hot fragment.");
487       // Compute the new length of the call, which is shorter than the original
488       // one by the size of the splitted fragment minus the total size increase.
489       const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
490       Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
491     }
492   }
493 
494   /// Compute the split score of splitting a function at a given index.
495   /// The split score consists of local score and cover score. Cover call score
496   /// is expensive to compute. As a result, we pass in a \p ReferenceScore and
497   /// compute cover score only when the local score exceeds that in the
498   /// ReferenceScore or that the size reduction of the hot fragment is larger
499   /// than that achieved by the split index of the ReferenceScore. This function
500   /// returns \p Score of SplitScore type. It contains the local score and cover
501   /// score (if computed) of the current splitting index. For easier book
502   /// keeping and comparison, it also stores the split index and the resulting
503   /// reduction in hot fragment size.
504   SplitScore computeSplitScore(const BinaryFunction &BF,
505                                const BasicBlockOrder &BlockOrder,
506                                const size_t SplitIndex,
507                                const std::vector<CallInfo> &CoverCalls,
508                                const SplitScore &ReferenceScore) {
509     // Populate BinaryBasicBlock::OutputAddressRange with estimated
510     // new start and end addresses after hot-warm splitting at SplitIndex.
511     size_t OldHotEnd;
512     size_t NewHotEnd;
513     std::tie(OldHotEnd, NewHotEnd) =
514         estimatePostSplitBBAddress(BlockOrder, SplitIndex);
515 
516     SplitScore Score;
517     Score.SplitIndex = SplitIndex;
518 
519     // It's not worth splitting if OldHotEnd < NewHotEnd.
520     if (OldHotEnd < NewHotEnd)
521       return Score;
522 
523     // Hot fragment size reduction due to splitting.
524     Score.HotSizeReduction = OldHotEnd - NewHotEnd;
525 
526     // First part of LocalScore is the sum over call edges originated in the
527     // input function. These edges can get shorter or longer depending on
528     // SplitIndex. Score.LocalScore is increamented in place.
529     computeLocalCallScore(BlockOrder, SplitIndex, Score);
530 
531     // Second part of LocalScore is the sum over jump edges with src basic block
532     // and dst basic block in the current function. Score.LocalScore is
533     // increamented in place.
534     computeJumpScore(BlockOrder, SplitIndex, Score);
535 
536     // There is no need to compute CoverCallScore if we have already found
537     // another split index with a bigger LocalScore and bigger HotSizeReduction.
538     if (Score.LocalScore <= ReferenceScore.LocalScore &&
539         Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
540       return Score;
541 
542     // Compute CoverCallScore and store in Score in place.
543     computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
544     return Score;
545   }
546 
547   /// Find the best index for splitting. The returned value is the index of the
548   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
549   /// value which is one less than the size of the function.
550   size_t findSplitIndex(const BinaryFunction &BF,
551                         const BasicBlockOrder &BlockOrder) {
552     // Find all function calls that can be shortened if we move blocks of the
553     // current function to warm/cold
554     const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
555 
556     // Try all possible split indices (blocks with Index <= SplitIndex are in
557     // hot) and find the one maximizing the splitting score.
558     SplitScore BestScore;
559     double BestScoreSum = -1.0;
560     SplitScore ReferenceScore;
561     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
562       const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
563       // No need to keep cold blocks in the hot section.
564       if (LastHotBB->getFragmentNum() == FragmentNum::cold())
565         break;
566       const SplitScore Score =
567           computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
568       double ScoreSum = Score.LocalScore + Score.CoverCallScore;
569       if (ScoreSum > BestScoreSum) {
570         BestScoreSum = ScoreSum;
571         BestScore = Score;
572       }
573       if (Score.LocalScore > ReferenceScore.LocalScore)
574         ReferenceScore = Score;
575     }
576 
577     return BestScore.SplitIndex;
578   }
579 };
580 
581 struct SplitRandom2 final : public SplitStrategy {
582   std::minstd_rand0 Gen;
583 
584   SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}
585 
586   bool canSplit(const BinaryFunction &BF) override { return true; }
587 
588   bool compactFragments() override { return true; }
589 
590   void fragment(const BlockIt Start, const BlockIt End) override {
591     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
592     const DiffT NumBlocks = End - Start;
593     assert(NumBlocks > 0 && "Cannot fragment empty function");
594 
595     // We want to split at least one block
596     const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
597     std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
598     const DiffT SplitPoint = Dist(Gen);
599     for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
600       BB->setFragmentNum(FragmentNum::cold());
601 
602     LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
603                                  "{1} possible) blocks to split\n",
604                                  NumBlocks - SplitPoint, End - Start));
605   }
606 };
607 
608 struct SplitRandomN final : public SplitStrategy {
609   std::minstd_rand0 Gen;
610 
611   SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}
612 
613   bool canSplit(const BinaryFunction &BF) override { return true; }
614 
615   bool compactFragments() override { return true; }
616 
617   void fragment(const BlockIt Start, const BlockIt End) override {
618     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
619     const DiffT NumBlocks = End - Start;
620     assert(NumBlocks > 0 && "Cannot fragment empty function");
621 
622     // With n blocks, there are n-1 places to split them.
623     const DiffT MaximumSplits = NumBlocks - 1;
624     // We want to generate at least two fragment if possible, but if there is
625     // only one block, no splits are possible.
626     const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
627     std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
628     // Choose how many splits to perform
629     const DiffT NumSplits = Dist(Gen);
630 
631     // Draw split points from a lottery
632     SmallVector<unsigned, 0> Lottery(MaximumSplits);
633     // Start lottery at 1, because there is no meaningful splitpoint before the
634     // first block.
635     std::iota(Lottery.begin(), Lottery.end(), 1u);
636     std::shuffle(Lottery.begin(), Lottery.end(), Gen);
637     Lottery.resize(NumSplits);
638     llvm::sort(Lottery);
639 
640     // Add one past the end entry to lottery
641     Lottery.push_back(NumBlocks);
642 
643     unsigned LotteryIndex = 0;
644     unsigned BBPos = 0;
645     for (BinaryBasicBlock *const BB : make_range(Start, End)) {
646       // Check whether to start new fragment
647       if (BBPos >= Lottery[LotteryIndex])
648         ++LotteryIndex;
649 
650       // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
651       // use the index to assign fragments.
652       BB->setFragmentNum(FragmentNum(LotteryIndex));
653 
654       ++BBPos;
655     }
656   }
657 };
658 
659 struct SplitAll final : public SplitStrategy {
660   bool canSplit(const BinaryFunction &BF) override { return true; }
661 
662   bool compactFragments() override {
663     // Keeping empty fragments allows us to test, that empty fragments do not
664     // generate symbols.
665     return false;
666   }
667 
668   void fragment(const BlockIt Start, const BlockIt End) override {
669     unsigned Fragment = 0;
670     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
671       BB->setFragmentNum(FragmentNum(Fragment++));
672   }
673 };
674 } // namespace
675 
676 namespace llvm {
677 namespace bolt {
678 
679 bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
680   // Apply execution count threshold
681   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
682     return false;
683 
684   return BinaryFunctionPass::shouldOptimize(BF);
685 }
686 
687 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
688   if (!opts::SplitFunctions)
689     return;
690 
691   // If split strategy is not CDSplit, then a second run of the pass is not
692   // needed after function reordering.
693   if (BC.HasFinalizedFunctionOrder &&
694       opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
695     return;
696 
697   std::unique_ptr<SplitStrategy> Strategy;
698   bool ForceSequential = false;
699 
700   switch (opts::SplitStrategy) {
701   case SplitFunctionsStrategy::CDSplit:
702     // CDSplit runs two splitting passes: hot-cold splitting (SplitPrfoile2)
703     // before function reordering and hot-warm-cold splitting
704     // (SplitCacheDirected) after function reordering.
705     if (BC.HasFinalizedFunctionOrder)
706       Strategy = std::make_unique<SplitCacheDirected>(BC);
707     else
708       Strategy = std::make_unique<SplitProfile2>();
709     opts::AggressiveSplitting = true;
710     BC.HasWarmSection = true;
711     break;
712   case SplitFunctionsStrategy::Profile2:
713     Strategy = std::make_unique<SplitProfile2>();
714     break;
715   case SplitFunctionsStrategy::Random2:
716     Strategy = std::make_unique<SplitRandom2>();
717     // If we split functions randomly, we need to ensure that across runs with
718     // the same input, we generate random numbers for each function in the same
719     // order.
720     ForceSequential = true;
721     break;
722   case SplitFunctionsStrategy::RandomN:
723     Strategy = std::make_unique<SplitRandomN>();
724     ForceSequential = true;
725     break;
726   case SplitFunctionsStrategy::All:
727     Strategy = std::make_unique<SplitAll>();
728     break;
729   }
730 
731   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
732     return !shouldOptimize(BF);
733   };
734 
735   ParallelUtilities::runOnEachFunction(
736       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
737       [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
738       "SplitFunctions", ForceSequential);
739 
740   if (SplitBytesHot + SplitBytesCold > 0)
741     outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
742            << " hot bytes from " << SplitBytesCold << " cold bytes "
743            << format("(%.2lf%% of split functions is hot).\n",
744                      100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
745 }
746 
747 void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
748   if (BF.empty())
749     return;
750 
751   if (!S.canSplit(BF))
752     return;
753 
754   FunctionLayout &Layout = BF.getLayout();
755   BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
756                                                      Layout.block_end());
757 
758   BinaryContext &BC = BF.getBinaryContext();
759   size_t OriginalHotSize;
760   size_t HotSize;
761   size_t ColdSize;
762   if (BC.isX86()) {
763     std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
764     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
765                       << " pre-split is <0x"
766                       << Twine::utohexstr(OriginalHotSize) << ", 0x"
767                       << Twine::utohexstr(ColdSize) << ">\n");
768   }
769 
770   BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
771                                                 Layout.block_end());
772   // Never outline the first basic block.
773   NewLayout.front()->setCanOutline(false);
774   for (BinaryBasicBlock *const BB : NewLayout) {
775     if (!BB->canOutline())
776       continue;
777 
778     // Do not split extra entry points in aarch64. They can be referred by
779     // using ADRs and when this happens, these blocks cannot be placed far
780     // away due to the limited range in ADR instruction.
781     if (BC.isAArch64() && BB->isEntryPoint()) {
782       BB->setCanOutline(false);
783       continue;
784     }
785 
786     if (BF.hasEHRanges() && !opts::SplitEH) {
787       // We cannot move landing pads (or rather entry points for landing pads).
788       if (BB->isLandingPad()) {
789         BB->setCanOutline(false);
790         continue;
791       }
792       // We cannot move a block that can throw since exception-handling
793       // runtime cannot deal with split functions. However, if we can guarantee
794       // that the block never throws, it is safe to move the block to
795       // decrease the size of the function.
796       for (MCInst &Instr : *BB) {
797         if (BC.MIB->isInvoke(Instr)) {
798           BB->setCanOutline(false);
799           break;
800         }
801       }
802     }
803   }
804 
805   BF.getLayout().updateLayoutIndices();
806   S.fragment(NewLayout.begin(), NewLayout.end());
807 
808   // Make sure all non-outlineable blocks are in the main-fragment.
809   for (BinaryBasicBlock *const BB : NewLayout) {
810     if (!BB->canOutline())
811       BB->setFragmentNum(FragmentNum::main());
812   }
813 
814   if (opts::AggressiveSplitting) {
815     // All blocks with 0 count that we can move go to the end of the function.
816     // Even if they were natural to cluster formation and were seen in-between
817     // hot basic blocks.
818     llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
819                                      const BinaryBasicBlock *const B) {
820       return A->getFragmentNum() < B->getFragmentNum();
821     });
822   } else if (BF.hasEHRanges() && !opts::SplitEH) {
823     // Typically functions with exception handling have landing pads at the end.
824     // We cannot move beginning of landing pads, but we can move 0-count blocks
825     // comprising landing pads to the end and thus facilitate splitting.
826     auto FirstLP = NewLayout.begin();
827     while ((*FirstLP)->isLandingPad())
828       ++FirstLP;
829 
830     std::stable_sort(FirstLP, NewLayout.end(),
831                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
832                        return A->getFragmentNum() < B->getFragmentNum();
833                      });
834   }
835 
836   // Make sure that fragments are increasing.
837   FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
838   for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
839     if (BB->getFragmentNum() > CurrentFragment)
840       BB->setFragmentNum(CurrentFragment);
841     CurrentFragment = BB->getFragmentNum();
842   }
843 
844   if (S.compactFragments()) {
845     FragmentNum CurrentFragment = FragmentNum::main();
846     FragmentNum NewFragment = FragmentNum::main();
847     for (BinaryBasicBlock *const BB : NewLayout) {
848       if (BB->getFragmentNum() > CurrentFragment) {
849         CurrentFragment = BB->getFragmentNum();
850         NewFragment = FragmentNum(NewFragment.get() + 1);
851       }
852       BB->setFragmentNum(NewFragment);
853     }
854   }
855 
856   const bool LayoutUpdated = BF.getLayout().update(NewLayout);
857 
858   // For shared objects, invoke instructions and corresponding landing pads
859   // have to be placed in the same fragment. When we split them, create
860   // trampoline landing pads that will redirect the execution to real LPs.
861   TrampolineSetType Trampolines;
862   if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
863     Trampolines = createEHTrampolines(BF);
864 
865   // Check the new size to see if it's worth splitting the function.
866   if (BC.isX86() && LayoutUpdated) {
867     std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
868     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
869                       << " post-split is <0x" << Twine::utohexstr(HotSize)
870                       << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
871     if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
872         alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
873       if (opts::Verbosity >= 2) {
874         outs() << "BOLT-INFO: Reversing splitting of function "
875                << formatv("{0}:\n  {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
876                           ColdSize, OriginalHotSize);
877       }
878 
879       // Reverse the action of createEHTrampolines(). The trampolines will be
880       // placed immediately before the matching destination resulting in no
881       // extra code.
882       if (PreSplitLayout.size() != BF.size())
883         PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);
884 
885       for (BinaryBasicBlock &BB : BF)
886         BB.setFragmentNum(FragmentNum::main());
887       BF.getLayout().update(PreSplitLayout);
888     } else {
889       SplitBytesHot += HotSize;
890       SplitBytesCold += ColdSize;
891     }
892   }
893 
894   // Fix branches if the splitting decision of the pass after function
895   // reordering is different from that of the pass before function reordering.
896   if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
897     BF.fixBranches();
898 }
899 
900 SplitFunctions::TrampolineSetType
901 SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
902   const auto &MIB = BF.getBinaryContext().MIB;
903 
904   // Map real landing pads to the corresponding trampolines.
905   TrampolineSetType LPTrampolines;
906 
907   // Iterate over the copy of basic blocks since we are adding new blocks to the
908   // function which will invalidate its iterators.
909   std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
910   for (BinaryBasicBlock *BB : Blocks) {
911     for (MCInst &Instr : *BB) {
912       const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
913       if (!EHInfo || !EHInfo->first)
914         continue;
915 
916       const MCSymbol *LPLabel = EHInfo->first;
917       BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
918       if (BB->getFragmentNum() == LPBlock->getFragmentNum())
919         continue;
920 
921       const MCSymbol *TrampolineLabel = nullptr;
922       const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
923       auto Iter = LPTrampolines.find(Key);
924       if (Iter != LPTrampolines.end()) {
925         TrampolineLabel = Iter->second;
926       } else {
927         // Create a trampoline basic block in the same fragment as the thrower.
928         // Note: there's no need to insert the jump instruction, it will be
929         // added by fixBranches().
930         BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
931         TrampolineBB->setFragmentNum(BB->getFragmentNum());
932         TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
933         TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
934         TrampolineBB->setCFIState(LPBlock->getCFIState());
935         TrampolineLabel = TrampolineBB->getLabel();
936         LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
937       }
938 
939       // Substitute the landing pad with the trampoline.
940       MIB->updateEHInfo(Instr,
941                         MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
942     }
943   }
944 
945   if (LPTrampolines.empty())
946     return LPTrampolines;
947 
948   // All trampoline blocks were added to the end of the function. Place them at
949   // the end of corresponding fragments.
950   BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
951                                                 BF.getLayout().block_end());
952   stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
953     return A->getFragmentNum() < B->getFragmentNum();
954   });
955   BF.getLayout().update(NewLayout);
956 
957   // Conservatively introduce branch instructions.
958   BF.fixBranches();
959 
960   // Update exception-handling CFG for the function.
961   BF.recomputeLandingPads();
962 
963   return LPTrampolines;
964 }
965 
966 SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
967     BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
968     const SplitFunctions::TrampolineSetType &Trampolines) const {
969   DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
970       IncomingTrampolines;
971   for (const auto &Entry : Trampolines) {
972     IncomingTrampolines[Entry.getFirst().Target].emplace_back(
973         Entry.getSecond());
974   }
975 
976   BasicBlockOrderType MergedLayout;
977   for (BinaryBasicBlock *BB : Layout) {
978     auto Iter = IncomingTrampolines.find(BB->getLabel());
979     if (Iter != IncomingTrampolines.end()) {
980       for (const MCSymbol *const Trampoline : Iter->getSecond()) {
981         BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
982         assert(LPBlock && "Could not find matching landing pad block.");
983         MergedLayout.push_back(LPBlock);
984       }
985     }
986     MergedLayout.push_back(BB);
987   }
988 
989   return MergedLayout;
990 }
991 
992 } // namespace bolt
993 } // namespace llvm
994