xref: /llvm-project/bolt/lib/Passes/SplitFunctions.cpp (revision 4483cf2d8b294aa8718b5488f2033675895369da)
1 //===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the SplitFunctions pass.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "bolt/Passes/SplitFunctions.h"
14 #include "bolt/Core/BinaryBasicBlock.h"
15 #include "bolt/Core/BinaryFunction.h"
16 #include "bolt/Core/FunctionLayout.h"
17 #include "bolt/Core/ParallelUtilities.h"
18 #include "bolt/Utils/CommandLineOpts.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/ADT/Sequence.h"
21 #include "llvm/ADT/SmallVector.h"
22 #include "llvm/ADT/iterator_range.h"
23 #include "llvm/Support/CommandLine.h"
24 #include "llvm/Support/FormatVariadic.h"
25 #include <algorithm>
26 #include <iterator>
27 #include <memory>
28 #include <numeric>
29 #include <random>
30 #include <vector>
31 
32 #define DEBUG_TYPE "bolt-opts"
33 
34 using namespace llvm;
35 using namespace bolt;
36 
37 namespace {
38 class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
39 public:
40   explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
41       : cl::parser<bool>(O) {}
42 
43   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
44     if (Arg == "2" || Arg == "3") {
45       Value = true;
46       errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
47                         "for option -{1} is deprecated\n",
48                         Arg, ArgName);
49       return false;
50     }
51     return cl::parser<bool>::parse(O, ArgName, Arg, Value);
52   }
53 };
54 } // namespace
55 
56 namespace opts {
57 
58 extern cl::OptionCategory BoltOptCategory;
59 
60 extern cl::opt<bool> SplitEH;
61 extern cl::opt<unsigned> ExecutionCountThreshold;
62 extern cl::opt<uint32_t> RandomSeed;
63 
64 static cl::opt<bool> AggressiveSplitting(
65     "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
66     cl::cat(BoltOptCategory));
67 
68 static cl::opt<unsigned> SplitAlignThreshold(
69     "split-align-threshold",
70     cl::desc("when deciding to split a function, apply this alignment "
71              "while doing the size comparison (see -split-threshold). "
72              "Default value: 2."),
73     cl::init(2),
74 
75     cl::Hidden, cl::cat(BoltOptCategory));
76 
77 static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
78     SplitFunctions("split-functions",
79                    cl::desc("split functions into fragments"),
80                    cl::cat(BoltOptCategory));
81 
82 static cl::opt<unsigned> SplitThreshold(
83     "split-threshold",
84     cl::desc("split function only if its main size is reduced by more than "
85              "given amount of bytes. Default value: 0, i.e. split iff the "
86              "size is reduced. Note that on some architectures the size can "
87              "increase after splitting."),
88     cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
89 
90 static cl::opt<SplitFunctionsStrategy> SplitStrategy(
91     "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
92     cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
93                           "split each function into a hot and cold fragment "
94                           "using profiling information")),
95     cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
96                           "split each function into a hot, warm, and cold "
97                           "fragment using profiling information")),
98     cl::values(clEnumValN(
99         SplitFunctionsStrategy::Random2, "random2",
100         "split each function into a hot and cold fragment at a randomly chosen "
101         "split point (ignoring any available profiling information)")),
102     cl::values(clEnumValN(
103         SplitFunctionsStrategy::RandomN, "randomN",
104         "split each function into N fragments at a randomly chosen split "
105         "points (ignoring any available profiling information)")),
106     cl::values(clEnumValN(
107         SplitFunctionsStrategy::All, "all",
108         "split all basic blocks of each function into fragments such that each "
109         "fragment contains exactly a single basic block")),
110     cl::desc("strategy used to partition blocks into fragments"),
111     cl::cat(BoltOptCategory));
112 
113 static cl::opt<double> CallScale(
114     "call-scale",
115     cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
116     cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
117 
118 static cl::opt<double>
119     CallPower("call-power",
120               cl::desc("Call score power (when --split-strategy=cdsplit)"),
121               cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
122 
123 static cl::opt<double>
124     JumpPower("jump-power",
125               cl::desc("Jump score power (when --split-strategy=cdsplit)"),
126               cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
127 } // namespace opts
128 
129 namespace {
130 bool hasFullProfile(const BinaryFunction &BF) {
131   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
132     return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
133   });
134 }
135 
136 bool allBlocksCold(const BinaryFunction &BF) {
137   return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
138     return BB.getExecutionCount() == 0;
139   });
140 }
141 
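// Profile-driven hot-cold splitting: every block with a zero execution count
// is moved to the cold fragment; all other blocks stay in the main fragment.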
142 struct SplitProfile2 final : public SplitStrategy {
143   bool canSplit(const BinaryFunction &BF) override {
144     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
145   }
146 
147   bool compactFragments() override { return true; }
148 
149   void fragment(const BlockIt Start, const BlockIt End) override {
150     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
151       if (BB->getExecutionCount() == 0)
152         BB->setFragmentNum(FragmentNum::cold());
153     }
154   }
155 };
156 
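// Cache-directed splitting (CDSplit): after function reordering, pick the
// hot/warm split point that maximizes a score derived from the estimated
// shortening of call and jump distances; zero-count blocks go to cold.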
157 struct SplitCacheDirected final : public SplitStrategy {
158   BinaryContext &BC;
159   using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;
160 
161   bool canSplit(const BinaryFunction &BF) override {
162     return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
163   }
164 
165   explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
166     initializeAuxiliaryVariables();
167     buildCallGraph();
168   }
169 
170   // When some functions are hot-warm split and others are hot-warm-cold split,
171   // we do not want to change the fragment numbers of the blocks in the hot-warm
172   // split functions.
173   bool compactFragments() override { return false; }
174 
175   void fragment(const BlockIt Start, const BlockIt End) override {
176     BasicBlockOrder BlockOrder(Start, End);
177     BinaryFunction &BF = *BlockOrder.front()->getFunction();
178 
179     size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
180 
181     // Assign fragments based on the computed best split index.
182     // All basic blocks with index up to the best split index become hot.
183     // All remaining blocks become warm or cold depending on whether their
184     // execution count is greater than zero.
185     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
186       BinaryBasicBlock *BB = BlockOrder[Index];
187       if (Index <= BestSplitIndex)
188         BB->setFragmentNum(FragmentNum::main());
189       else
190         BB->setFragmentNum(BB->getKnownExecutionCount() > 0
191                                ? FragmentNum::warm()
192                                : FragmentNum::cold());
193     }
194   }
195 
196 private:
197   struct JumpInfo {
198     bool HasUncondBranch = false;
199     BinaryBasicBlock *CondSuccessor = nullptr;
200     BinaryBasicBlock *UncondSuccessor = nullptr;
201   };
202 
203   struct CallInfo {
204     size_t Length;
205     size_t Count;
206   };
207 
208   struct SplitScore {
209     size_t SplitIndex;
210     size_t HotSizeReduction = 0;
211     double LocalScore = 0;
212     double CoverCallScore = 0;
213   };
214 
215   // Auxiliary variables used by the algorithm.
216   size_t TotalNumBlocks{0};
217   size_t OrigHotSectionSize{0};
218   DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
219   DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
220   DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
221   DenseMap<const BinaryBasicBlock *, JumpInfo> JumpInfos;
222 
223   // Call graph.
224   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
225   std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;
226 
227   bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
228     // Only a subset of the functions in the binary will be considered
229     // for initializing auxiliary variables and building the call graph.
230     return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
231   }
232 
233   void initializeAuxiliaryVariables() {
234     // Gather information about conditional and unconditional successors of
235     // each basic block; this information will be used to estimate block size
236     // increase due to hot-warm splitting.
237     auto analyzeBranches = [&](BinaryBasicBlock &BB) {
238       JumpInfo BBJumpInfo;
239       const MCSymbol *TBB = nullptr;
240       const MCSymbol *FBB = nullptr;
241       MCInst *CondBranch = nullptr;
242       MCInst *UncondBranch = nullptr;
243       if (BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
244         BBJumpInfo.HasUncondBranch = UncondBranch != nullptr;
245         if (BB.succ_size() == 1) {
246           BBJumpInfo.UncondSuccessor = BB.getSuccessor();
247         } else if (BB.succ_size() == 2) {
248           BBJumpInfo.CondSuccessor = BB.getConditionalSuccessor(true);
249           BBJumpInfo.UncondSuccessor = BB.getConditionalSuccessor(false);
250         }
251       }
252       return BBJumpInfo;
253     };
254 
255     for (BinaryFunction *BF : BC.getSortedFunctions()) {
256       if (!shouldConsiderForCallGraph(*BF))
257         continue;
258 
259       // Calculate the size of each BB after hot-cold splitting.
260       // This populates BinaryBasicBlock::OutputAddressRange which
261       // can be used to compute the size of each BB.
262       BC.calculateEmittedSize(*BF, /*FixBranches=*/true);
263 
264       for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
265         // Unique global index.
266         GlobalIndices[BB] = TotalNumBlocks;
267         TotalNumBlocks++;
268 
269         // Block size after hot-cold splitting.
270         BBSizes[BB] = BB->getOutputSize();
271 
272         // Hot block offset after hot-cold splitting.
273         BBOffsets[BB] = OrigHotSectionSize;
274         if (!BB->isSplit())
275           OrigHotSectionSize += BBSizes[BB];
276 
277         // (Un)Conditional branch instruction information.
278         JumpInfos[BB] = analyzeBranches(*BB);
279       }
280     }
281   }
282 
283   void buildCallGraph() {
284     Callers.resize(TotalNumBlocks);
285     Callees.resize(TotalNumBlocks);
286     for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
287       if (!shouldConsiderForCallGraph(*SrcFunction))
288         continue;
289 
290       for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
291         // Skip blocks that are not executed
292         if (SrcBB.getKnownExecutionCount() == 0)
293           continue;
294 
295         // Find call instructions and extract target symbols from each one
296         for (const MCInst &Inst : SrcBB) {
297           if (!BC.MIB->isCall(Inst))
298             continue;
299 
300           // Call info
301           const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
302           // Ignore calls w/o information
303           if (!DstSym)
304             continue;
305 
306           const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
307           // Ignore calls that do not have a valid target, but do not ignore
308           // recursive calls, because caller block could be moved to warm.
309           if (!DstFunction || DstFunction->getLayout().block_empty())
310             continue;
311 
312           const BinaryBasicBlock *DstBB = &(DstFunction->front());
313 
314           // Record the call only if DstBB belongs to a function that is also
315           // considered for the call graph.
316           if (GlobalIndices.contains(DstBB)) {
317             Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
318             Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
319           }
320         }
321       }
322     }
323   }
324 
325   /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
326   /// start and end addresses for hot and warm basic blocks, assuming hot-warm
327   /// splitting happens at \p SplitIndex. Also return estimated end addresses
328   /// of the hot fragment before and after splitting.
329   /// The estimations take into account the potential addition of branch
330   /// instructions due to split fall through branches as well as the need to
331   /// use longer branch instructions for split (un)conditional branches.
332   std::pair<size_t, size_t>
333   estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
334                              const size_t SplitIndex) {
335     assert(SplitIndex < BlockOrder.size() && "Invalid split index");
336 
337     // Update function layout assuming hot-warm splitting at SplitIndex
338     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
339       BinaryBasicBlock *BB = BlockOrder[Index];
340       if (BB->getFragmentNum() == FragmentNum::cold())
341         break;
342       BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
343                                              : FragmentNum::warm());
344     }
345     BinaryFunction *BF = BlockOrder[0]->getFunction();
346     BF->getLayout().update(BlockOrder);
347     // Populate BB.OutputAddressRange under the updated layout.
348     BC.calculateEmittedSize(*BF);
349 
350     // Populate BB.OutputAddressRange with estimated new start and end addresses
351     // and compute the old end address of the hot section and the new end
352     // address of the hot section.
353     size_t OldHotEndAddr;
354     size_t NewHotEndAddr;
355     size_t CurrentAddr = BBOffsets[BlockOrder[0]];
356     for (BinaryBasicBlock *BB : BlockOrder) {
357       // We only care about new addresses of blocks in hot/warm.
358       if (BB->getFragmentNum() == FragmentNum::cold())
359         break;
360       const size_t NewSize = BB->getOutputSize();
361       BB->setOutputStartAddress(CurrentAddr);
362       CurrentAddr += NewSize;
363       BB->setOutputEndAddress(CurrentAddr);
364       if (BB->getLayoutIndex() == SplitIndex) {
365         NewHotEndAddr = CurrentAddr;
366         // Approximate the start address of the warm fragment of the current
367         // function using the original hot section size.
368         CurrentAddr = OrigHotSectionSize;
369       }
370       OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
371     }
372     return std::make_pair(OldHotEndAddr, NewHotEndAddr);
373   }
374 
375   /// Get a collection of "shortenable" calls, that is, calls of type X->Y
376   /// when the function order is [... X ... BF ... Y ...].
377   /// If the hot fragment size of BF is reduced, then such calls are guaranteed
378   /// to get shorter by the reduced hot fragment size.
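  /// For example, with function order [A, BF, B] and a call from A to B, the
  /// call "covers" BF; shrinking BF's hot fragment by N bytes shortens that
  /// call by the same N bytes.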
379   std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
380     // Record the length and the count of the calls that can be shortened
381     std::vector<CallInfo> CoverCalls;
382     if (opts::CallScale == 0)
383       return CoverCalls;
384 
385     const BinaryFunction *ThisBF = &BF;
386     const BinaryBasicBlock *ThisBB = &(ThisBF->front());
387     const size_t ThisGI = GlobalIndices[ThisBB];
388 
389     for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
390       if (!shouldConsiderForCallGraph(*DstBF))
391         continue;
392 
393       const BinaryBasicBlock *DstBB = &(DstBF->front());
394       if (DstBB->getKnownExecutionCount() == 0)
395         continue;
396 
397       const size_t DstGI = GlobalIndices[DstBB];
398       for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
399         const BinaryFunction *SrcBF = SrcBB->getFunction();
400         if (ThisBF == SrcBF)
401           continue;
402 
403         const size_t CallCount = SrcBB->getKnownExecutionCount();
404 
405         const size_t SrcGI = GlobalIndices[SrcBB];
406 
407         const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
408                                  (DstGI <= ThisGI && ThisGI < SrcGI);
409         if (!IsCoverCall)
410           continue;
411 
412         const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
413         const size_t DstBBStartAddr = BBOffsets[DstBB];
414         const size_t CallLength =
415             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
416         const CallInfo CI{CallLength, CallCount};
417         CoverCalls.emplace_back(CI);
418       }
419     }
420     return CoverCalls;
421   }
422 
423   /// Compute the edge score of a call edge.
424   double computeCallScore(uint64_t CallCount, size_t CallLength) {
425     // Increase call lengths by 1 to avoid raising 0 to a negative power.
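    // For instance, with the defaults CallScale=0.95 and CallPower=0.05, a
    // call executed 100 times over a distance of 1023 bytes scores roughly
    // 0.95 * 100 / 1024^0.05 = 95 / sqrt(2) ~= 67.2.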
426     return opts::CallScale * static_cast<double>(CallCount) /
427            std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
428   }
429 
430   /// Compute the edge score of a jump (branch) edge.
431   double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
432     // Increase jump lengths by 1 to avoid raising 0 to a negative power.
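    // For instance, with the default JumpPower=0.15, a branch taken 100 times
    // over a distance of 127 bytes scores roughly 100 / 128^0.15 ~= 48.3.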
433     return static_cast<double>(JumpCount) /
434            std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
435   }
436 
437   /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
438   /// Increment Score.LocalScore in place by the sum.
439   void computeJumpScore(const BasicBlockOrder &BlockOrder,
440                         const size_t SplitIndex, SplitScore &Score) {
441 
442     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
443       if (SrcBB->getKnownExecutionCount() == 0)
444         continue;
445 
446       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
447 
448       for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
449         const BinaryBasicBlock *DstBB = std::get<0>(Pair);
450         const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
451         const size_t JumpCount = Branch.Count;
452 
453         if (JumpCount == 0)
454           continue;
455 
456         const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
457         const size_t NewJumpLength =
458             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
459         Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
460       }
461     }
462   }
463 
464   /// Compute sum of scores over calls originating in the current function
465   /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
466   void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
467                              const size_t SplitIndex, SplitScore &Score) {
468     if (opts::CallScale == 0)
469       return;
470 
471     // Global index of the last block in the current function.
472     // This is later used to determine whether a call originating in the current
473     // function targets a function that comes after the current function.
474     const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
475 
476     // The length of calls originating in the input function can increase /
477     // decrease depending on the splitting decision.
478     for (const BinaryBasicBlock *SrcBB : BlockOrder) {
479       const size_t CallCount = SrcBB->getKnownExecutionCount();
480       // If SrcBB does not call any functions, skip it.
481       if (CallCount == 0)
482         continue;
483 
484       // Obtain an estimate on the end address of the src basic block
485       // after splitting at SplitIndex.
486       const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
487 
488       for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
489         // Obtain an estimate on the start address of the dst basic block
490         // after splitting at SplitIndex. If DstBB is in a function before
491         // the current function, then its start address remains unchanged.
492         size_t DstBBStartAddr = BBOffsets[DstBB];
493         // If DstBB is in a function after the current function, then its
494         // start address should be adjusted based on the reduction in hot size.
495         if (GlobalIndices[DstBB] > LastGlobalIndex) {
496           assert(DstBBStartAddr >= Score.HotSizeReduction);
497           DstBBStartAddr -= Score.HotSizeReduction;
498         }
499         const size_t NewCallLength =
500             AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
501         Score.LocalScore += computeCallScore(CallCount, NewCallLength);
502       }
503     }
504   }
505 
506   /// Compute sum of splitting scores for cover calls of the input function.
507   /// Increment Score.CoverCallScore in place by the sum.
508   void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
509                              const size_t SplitIndex,
510                              const std::vector<CallInfo> &CoverCalls,
511                              SplitScore &Score) {
512     if (opts::CallScale == 0)
513       return;
514 
515     for (const CallInfo CI : CoverCalls) {
516       assert(CI.Length >= Score.HotSizeReduction &&
517              "Length of cover calls must exceed reduced size of hot fragment.");
518       // Compute the new length of the call, which is shorter than the original
519       // one by the size of the split fragment minus the total size increase.
520       const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
521       Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
522     }
523   }
524 
525   /// Compute the split score of splitting a function at a given index.
526   /// The split score consists of local score and cover score. Cover call score
527   /// is expensive to compute. As a result, we pass in a \p ReferenceScore and
528   /// compute the cover score only when the local score exceeds that of the
529   /// ReferenceScore or when the size reduction of the hot fragment is larger
530   /// than that achieved by the split index of the ReferenceScore. This function
531   /// returns \p Score of SplitScore type. It contains the local score and cover
532   /// score (if computed) of the current split index. For easier bookkeeping
533   /// and comparison, it also stores the split index and the resulting
534   /// reduction in hot fragment size.
535   SplitScore computeSplitScore(const BinaryFunction &BF,
536                                const BasicBlockOrder &BlockOrder,
537                                const size_t SplitIndex,
538                                const std::vector<CallInfo> &CoverCalls,
539                                const SplitScore &ReferenceScore) {
540     // Populate BinaryBasicBlock::OutputAddressRange with estimated
541     // new start and end addresses after hot-warm splitting at SplitIndex.
542     size_t OldHotEnd;
543     size_t NewHotEnd;
544     std::tie(OldHotEnd, NewHotEnd) =
545         estimatePostSplitBBAddress(BlockOrder, SplitIndex);
546 
547     SplitScore Score;
548     Score.SplitIndex = SplitIndex;
549 
550     // It's not worth splitting if OldHotEnd < NewHotEnd.
551     if (OldHotEnd < NewHotEnd)
552       return Score;
553 
554     // Hot fragment size reduction due to splitting.
555     Score.HotSizeReduction = OldHotEnd - NewHotEnd;
556 
557     // First part of LocalScore is the sum over call edges originating in the
558     // input function. These edges can get shorter or longer depending on
559     // SplitIndex. Score.LocalScore is incremented in place.
560     computeLocalCallScore(BlockOrder, SplitIndex, Score);
561 
562     // Second part of LocalScore is the sum over jump edges with src basic block
563     // and dst basic block in the current function. Score.LocalScore is
564     // incremented in place.
565     computeJumpScore(BlockOrder, SplitIndex, Score);
566 
567     // There is no need to compute CoverCallScore if we have already found
568     // another split index with a bigger LocalScore and bigger HotSizeReduction.
569     if (Score.LocalScore <= ReferenceScore.LocalScore &&
570         Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
571       return Score;
572 
573     // Compute CoverCallScore and store in Score in place.
574     computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
575     return Score;
576   }
577 
578   /// Find the best index for splitting. The returned value is the index of the
579   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
580   /// value that is one less than the number of basic blocks in the function.
581   size_t findSplitIndex(const BinaryFunction &BF,
582                         const BasicBlockOrder &BlockOrder) {
583     // Find all function calls that can be shortened if we move blocks of the
584     // current function to warm/cold
585     const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
586 
587     // Try all possible split indices (blocks with Index <= SplitIndex are in
588     // hot) and find the one maximizing the splitting score.
589     SplitScore BestScore;
590     double BestScoreSum = -1.0;
591     SplitScore ReferenceScore;
592     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
593       const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
594       // No need to keep cold blocks in the hot section.
595       if (LastHotBB->getFragmentNum() == FragmentNum::cold())
596         break;
597       const SplitScore Score =
598           computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
599       double ScoreSum = Score.LocalScore + Score.CoverCallScore;
600       if (ScoreSum > BestScoreSum) {
601         BestScoreSum = ScoreSum;
602         BestScore = Score;
603       }
604       if (Score.LocalScore > ReferenceScore.LocalScore)
605         ReferenceScore = Score;
606     }
607 
608     return BestScore.SplitIndex;
609   }
610 };
611 
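// Split each function into two fragments at a single uniformly random split
// point, ignoring any available profile information.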
612 struct SplitRandom2 final : public SplitStrategy {
613   std::minstd_rand0 Gen;
614 
615   SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}
616 
617   bool canSplit(const BinaryFunction &BF) override { return true; }
618 
619   bool compactFragments() override { return true; }
620 
621   void fragment(const BlockIt Start, const BlockIt End) override {
622     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
623     const DiffT NumBlocks = End - Start;
624     assert(NumBlocks > 0 && "Cannot fragment empty function");
625 
626     // We want to split at least one block
627     const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
628     std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
629     const DiffT SplitPoint = Dist(Gen);
630     for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
631       BB->setFragmentNum(FragmentNum::cold());
632 
633     LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
634                                  "{1} possible) blocks to split\n",
635                                  NumBlocks - SplitPoint, End - Start));
636   }
637 };
638 
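// Split each function into a random number of fragments at randomly chosen
// split points, ignoring any available profile information.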
639 struct SplitRandomN final : public SplitStrategy {
640   std::minstd_rand0 Gen;
641 
642   SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}
643 
644   bool canSplit(const BinaryFunction &BF) override { return true; }
645 
646   bool compactFragments() override { return true; }
647 
648   void fragment(const BlockIt Start, const BlockIt End) override {
649     using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
650     const DiffT NumBlocks = End - Start;
651     assert(NumBlocks > 0 && "Cannot fragment empty function");
652 
653     // With n blocks, there are n-1 places to split them.
654     const DiffT MaximumSplits = NumBlocks - 1;
655     // We want to generate at least two fragments if possible, but if there is
656     // only one block, no splits are possible.
657     const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
658     std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
659     // Choose how many splits to perform
660     const DiffT NumSplits = Dist(Gen);
661 
662     // Draw split points from a lottery
663     SmallVector<unsigned, 0> Lottery(MaximumSplits);
664     // Start the lottery at 1, because there is no meaningful split point before
665     // the first block.
666     std::iota(Lottery.begin(), Lottery.end(), 1u);
667     std::shuffle(Lottery.begin(), Lottery.end(), Gen);
668     Lottery.resize(NumSplits);
669     llvm::sort(Lottery);
670 
671     // Add one past the end entry to lottery
672     Lottery.push_back(NumBlocks);
673 
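    // Example: with 6 blocks and NumSplits=2, a shuffled-and-sorted lottery of
    // {2, 4} becomes {2, 4, 6}; blocks 0-1 land in fragment 0 (main), blocks
    // 2-3 in fragment 1, and blocks 4-5 in fragment 2.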
674     unsigned LotteryIndex = 0;
675     unsigned BBPos = 0;
676     for (BinaryBasicBlock *const BB : make_range(Start, End)) {
677       // Check whether to start a new fragment.
678       if (BBPos >= Lottery[LotteryIndex])
679         ++LotteryIndex;
680 
681       // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
682       // use the index to assign fragments.
683       BB->setFragmentNum(FragmentNum(LotteryIndex));
684 
685       ++BBPos;
686     }
687   }
688 };
689 
690 struct SplitAll final : public SplitStrategy {
691   bool canSplit(const BinaryFunction &BF) override { return true; }
692 
693   bool compactFragments() override {
694     // Keeping empty fragments allows us to test that empty fragments do not
695     // generate symbols.
696     return false;
697   }
698 
699   void fragment(const BlockIt Start, const BlockIt End) override {
700     unsigned Fragment = 0;
701     for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
702       BB->setFragmentNum(FragmentNum(Fragment++));
703   }
704 };
705 } // namespace
706 
707 namespace llvm {
708 namespace bolt {
709 
710 bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
711   // Apply execution count threshold
712   if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
713     return false;
714 
715   return BinaryFunctionPass::shouldOptimize(BF);
716 }
717 
718 void SplitFunctions::runOnFunctions(BinaryContext &BC) {
719   if (!opts::SplitFunctions)
720     return;
721 
722   // If split strategy is not CDSplit, then a second run of the pass is not
723   // needed after function reordering.
724   if (BC.HasFinalizedFunctionOrder &&
725       opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
726     return;
727 
728   std::unique_ptr<SplitStrategy> Strategy;
729   bool ForceSequential = false;
730 
731   switch (opts::SplitStrategy) {
732   case SplitFunctionsStrategy::CDSplit:
733     // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
734     // before function reordering and hot-warm-cold splitting
735     // (SplitCacheDirected) after function reordering.
736     if (BC.HasFinalizedFunctionOrder)
737       Strategy = std::make_unique<SplitCacheDirected>(BC);
738     else
739       Strategy = std::make_unique<SplitProfile2>();
740     opts::AggressiveSplitting = true;
741     BC.HasWarmSection = true;
742     break;
743   case SplitFunctionsStrategy::Profile2:
744     Strategy = std::make_unique<SplitProfile2>();
745     break;
746   case SplitFunctionsStrategy::Random2:
747     Strategy = std::make_unique<SplitRandom2>();
748     // If we split functions randomly, we need to ensure that across runs with
749     // the same input, we generate random numbers for each function in the same
750     // order.
751     ForceSequential = true;
752     break;
753   case SplitFunctionsStrategy::RandomN:
754     Strategy = std::make_unique<SplitRandomN>();
755     ForceSequential = true;
756     break;
757   case SplitFunctionsStrategy::All:
758     Strategy = std::make_unique<SplitAll>();
759     break;
760   }
761 
762   ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
763     return !shouldOptimize(BF);
764   };
765 
766   ParallelUtilities::runOnEachFunction(
767       BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
768       [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
769       "SplitFunctions", ForceSequential);
770 
771   if (SplitBytesHot + SplitBytesCold > 0)
772     outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
773            << " hot bytes from " << SplitBytesCold << " cold bytes "
774            << format("(%.2lf%% of split functions is hot).\n",
775                      100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
776 }
777 
778 void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
779   if (BF.empty())
780     return;
781 
782   if (!S.canSplit(BF))
783     return;
784 
785   FunctionLayout &Layout = BF.getLayout();
786   BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
787                                                      Layout.block_end());
788 
789   BinaryContext &BC = BF.getBinaryContext();
790   size_t OriginalHotSize;
791   size_t HotSize;
792   size_t ColdSize;
793   if (BC.isX86()) {
794     std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
795     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
796                       << " pre-split is <0x"
797                       << Twine::utohexstr(OriginalHotSize) << ", 0x"
798                       << Twine::utohexstr(ColdSize) << ">\n");
799   }
800 
801   BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
802                                                 Layout.block_end());
803   // Never outline the first basic block.
804   NewLayout.front()->setCanOutline(false);
805   for (BinaryBasicBlock *const BB : NewLayout) {
806     if (!BB->canOutline())
807       continue;
808 
809     // Do not split extra entry points on AArch64. They can be referenced via
810     // ADR instructions, and in that case these blocks cannot be placed far
811     // away due to the limited range of the ADR instruction.
812     if (BC.isAArch64() && BB->isEntryPoint()) {
813       BB->setCanOutline(false);
814       continue;
815     }
816 
817     if (BF.hasEHRanges() && !opts::SplitEH) {
818       // We cannot move landing pads (or rather entry points for landing pads).
819       if (BB->isLandingPad()) {
820         BB->setCanOutline(false);
821         continue;
822       }
823       // We cannot move a block that can throw since the exception-handling
824       // runtime cannot deal with split functions. However, if we can guarantee
825       // that the block never throws, it is safe to move the block to
826       // decrease the size of the function.
827       for (MCInst &Instr : *BB) {
828         if (BC.MIB->isInvoke(Instr)) {
829           BB->setCanOutline(false);
830           break;
831         }
832       }
833     }
834   }
835 
836   BF.getLayout().updateLayoutIndices();
837   S.fragment(NewLayout.begin(), NewLayout.end());
838 
839   // Make sure all non-outlineable blocks are in the main-fragment.
840   for (BinaryBasicBlock *const BB : NewLayout) {
841     if (!BB->canOutline())
842       BB->setFragmentNum(FragmentNum::main());
843   }
844 
845   if (opts::AggressiveSplitting) {
846     // All movable blocks with a zero count go to the end of the function, even
847     // if they arose naturally during cluster formation and were placed in
848     // between hot basic blocks.
849     llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
850                                      const BinaryBasicBlock *const B) {
851       return A->getFragmentNum() < B->getFragmentNum();
852     });
853   } else if (BF.hasEHRanges() && !opts::SplitEH) {
854     // Typically functions with exception handling have landing pads at the end.
855     // We cannot move the beginning of a landing pad, but we can move 0-count
856     // blocks comprising landing pads to the end and thus facilitate splitting.
857     auto FirstLP = NewLayout.begin();
858     while ((*FirstLP)->isLandingPad())
859       ++FirstLP;
860 
861     std::stable_sort(FirstLP, NewLayout.end(),
862                      [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
863                        return A->getFragmentNum() < B->getFragmentNum();
864                      });
865   }
866 
867   // Make sure that fragments are increasing.
868   FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
869   for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
870     if (BB->getFragmentNum() > CurrentFragment)
871       BB->setFragmentNum(CurrentFragment);
872     CurrentFragment = BB->getFragmentNum();
873   }
874 
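  // Optionally renumber fragments so they are contiguous, e.g. fragments
  // {0, 2, 5} become {0, 1, 2}.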
875   if (S.compactFragments()) {
876     FragmentNum CurrentFragment = FragmentNum::main();
877     FragmentNum NewFragment = FragmentNum::main();
878     for (BinaryBasicBlock *const BB : NewLayout) {
879       if (BB->getFragmentNum() > CurrentFragment) {
880         CurrentFragment = BB->getFragmentNum();
881         NewFragment = FragmentNum(NewFragment.get() + 1);
882       }
883       BB->setFragmentNum(NewFragment);
884     }
885   }
886 
887   const bool LayoutUpdated = BF.getLayout().update(NewLayout);
888 
889   // For shared objects, invoke instructions and corresponding landing pads
890   // have to be placed in the same fragment. When we split them apart, create
891   // trampoline landing pads that will redirect the execution to the real LPs.
892   TrampolineSetType Trampolines;
893   if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
894     Trampolines = createEHTrampolines(BF);
895 
896   // Check the new size to see if it's worth splitting the function.
897   if (BC.isX86() && LayoutUpdated) {
898     std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
899     LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
900                       << " post-split is <0x" << Twine::utohexstr(HotSize)
901                       << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
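    // With the defaults (-split-align-threshold=2, -split-threshold=0), a hot
    // part shrinking from 100 to 99 bytes is still reverted because both sizes
    // align up to 100, whereas shrinking to 98 bytes keeps the split.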
902     if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
903         alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
904       if (opts::Verbosity >= 2) {
905         outs() << "BOLT-INFO: Reversing splitting of function "
906                << formatv("{0}:\n  {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
907                           ColdSize, OriginalHotSize);
908       }
909 
910       // Reverse the action of createEHTrampolines(). The trampolines will be
911       // placed immediately before the matching destination resulting in no
912       // extra code.
913       if (PreSplitLayout.size() != BF.size())
914         PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);
915 
916       for (BinaryBasicBlock &BB : BF)
917         BB.setFragmentNum(FragmentNum::main());
918       BF.getLayout().update(PreSplitLayout);
919     } else {
920       SplitBytesHot += HotSize;
921       SplitBytesCold += ColdSize;
922     }
923   }
924 
925   // Fix branches if the splitting decision of the pass after function
926   // reordering is different from that of the pass before function reordering.
927   if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
928     BF.fixBranches();
929 }
930 
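/// For each invoke whose landing pad ends up in a different fragment, create a
/// trampoline block in the invoke's fragment that branches to the real landing
/// pad (the branch itself is added later by fixBranches()), and retarget the
/// invoke's EH info at the trampoline.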
931 SplitFunctions::TrampolineSetType
932 SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
933   const auto &MIB = BF.getBinaryContext().MIB;
934 
935   // Map real landing pads to the corresponding trampolines.
936   TrampolineSetType LPTrampolines;
937 
938   // Iterate over a copy of the basic blocks since we are adding new blocks to
939   // the function, which will invalidate its iterators.
940   std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
941   for (BinaryBasicBlock *BB : Blocks) {
942     for (MCInst &Instr : *BB) {
943       const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
944       if (!EHInfo || !EHInfo->first)
945         continue;
946 
947       const MCSymbol *LPLabel = EHInfo->first;
948       BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
949       if (BB->getFragmentNum() == LPBlock->getFragmentNum())
950         continue;
951 
952       const MCSymbol *TrampolineLabel = nullptr;
953       const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
954       auto Iter = LPTrampolines.find(Key);
955       if (Iter != LPTrampolines.end()) {
956         TrampolineLabel = Iter->second;
957       } else {
958         // Create a trampoline basic block in the same fragment as the thrower.
959         // Note: there's no need to insert the jump instruction, it will be
960         // added by fixBranches().
961         BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
962         TrampolineBB->setFragmentNum(BB->getFragmentNum());
963         TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
964         TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
965         TrampolineBB->setCFIState(LPBlock->getCFIState());
966         TrampolineLabel = TrampolineBB->getLabel();
967         LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
968       }
969 
970       // Substitute the landing pad with the trampoline.
971       MIB->updateEHInfo(Instr,
972                         MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
973     }
974   }
975 
976   if (LPTrampolines.empty())
977     return LPTrampolines;
978 
979   // All trampoline blocks were added to the end of the function. Place them at
980   // the end of corresponding fragments.
981   BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
982                                                 BF.getLayout().block_end());
983   stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
984     return A->getFragmentNum() < B->getFragmentNum();
985   });
986   BF.getLayout().update(NewLayout);
987 
988   // Conservatively introduce branch instructions.
989   BF.fixBranches();
990 
991   // Update exception-handling CFG for the function.
992   BF.recomputeLandingPads();
993 
994   return LPTrampolines;
995 }
996 
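/// Undo createEHTrampolines() when a split is reverted: re-insert each
/// trampoline block immediately before its real landing pad in \p Layout so
/// that the extra branch folds away, and return the merged layout.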
997 SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
998     BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
999     const SplitFunctions::TrampolineSetType &Trampolines) const {
1000   DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
1001       IncomingTrampolines;
1002   for (const auto &Entry : Trampolines) {
1003     IncomingTrampolines[Entry.getFirst().Target].emplace_back(
1004         Entry.getSecond());
1005   }
1006 
1007   BasicBlockOrderType MergedLayout;
1008   for (BinaryBasicBlock *BB : Layout) {
1009     auto Iter = IncomingTrampolines.find(BB->getLabel());
1010     if (Iter != IncomingTrampolines.end()) {
1011       for (const MCSymbol *const Trampoline : Iter->getSecond()) {
1012         BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
1013         assert(LPBlock && "Could not find matching landing pad block.");
1014         MergedLayout.push_back(LPBlock);
1015       }
1016     }
1017     MergedLayout.push_back(BB);
1018   }
1019 
1020   return MergedLayout;
1021 }
1022 
1023 } // namespace bolt
1024 } // namespace llvm
1025