//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
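// Parser that accepts the legacy values "2" and "3" for the boolean
// -split-functions option, treating them as "true" and printing a
// deprecation warning instead of rejecting them.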
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split a function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));
} // namespace opts

namespace {
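// Profile-guided strategy: outline blocks that were never executed into a
// single cold fragment. Functions without a valid profile, or whose blocks
// are all cold, are not split.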
struct SplitProfile2 {
  bool canSplit(const BinaryFunction &BF) {
    if (!BF.hasValidProfile())
      return false;

    bool AllCold = true;
    for (const BinaryBasicBlock &BB : BF) {
      const uint64_t ExecCount = BB.getExecutionCount();
      if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
        return false;
      if (ExecCount != 0)
        AllCold = false;
    }

    return !AllCold;
  }

  bool canOutline(const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  }

  template <typename It> void partition(const It Start, const It End) const {
    std::for_each(Start, End, [](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum::cold());
    });
  }
};

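// Random strategy: choose, uniformly at random, how many of the trailing
// outlineable blocks to move into a single cold fragment, ignoring any
// profile information.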
struct SplitRandom2 {
  std::minstd_rand0 *Gen;

  explicit SplitRandom2(std::minstd_rand0 &Gen) : Gen(&Gen) {}

  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    using DiffT = typename std::iterator_traits<It>::difference_type;
    const DiffT NumOutlineableBlocks = End - Start;

    // We want to split at least one block unless there are no blocks that can
    // be outlined.
    const auto MinimumSplit = std::min<DiffT>(NumOutlineableBlocks, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplit,
                                              NumOutlineableBlocks);
    const DiffT NumColdBlocks = Dist(*Gen);
    std::for_each(End - NumColdBlocks, End, [](BinaryBasicBlock *BB) {
      BB->setFragmentNum(FragmentNum::cold());
    });

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumColdBlocks, End - Start));
  }
};

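// Random strategy: split the outlineable blocks into a random number of
// fragments at randomly chosen split points, ignoring any profile
// information.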
struct SplitRandomN {
  std::minstd_rand0 *Gen;

  explicit SplitRandomN(std::minstd_rand0 &Gen) : Gen(&Gen) {}

  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    using DiffT = typename std::iterator_traits<It>::difference_type;
    const DiffT NumOutlineableBlocks = End - Start;

    // We want to create at least one split if possible.
    const auto MinimumSplits = std::min<DiffT>(NumOutlineableBlocks, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits,
                                              NumOutlineableBlocks);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(*Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(NumOutlineableBlocks);
    std::iota(Lottery.begin(), Lottery.end(), 0u);
    std::shuffle(Lottery.begin(), Lottery.end(), *Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumOutlineableBlocks);

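    // For example, with Lottery = {2, 5, NumOutlineableBlocks}, blocks at
    // positions 0-1 stay in the main fragment, positions 2-4 form cold
    // fragment 1, and positions 5 and above form cold fragment 2.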
    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

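// Strategy that places every outlineable block into its own fragment.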
struct SplitAll {
  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    unsigned Fragment = 1;
    std::for_each(Start, End, [&](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum(Fragment++));
    });
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

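// Pass entry point: pick the splitting strategy selected on the command line
// and run it over all eligible functions, in parallel unless the strategy
// needs deterministic sequential processing of functions.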
void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  std::minstd_rand0 RandGen(opts::RandomSeed.getValue());

  ParallelUtilities::WorkFuncTy WorkFun;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::Profile2:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitProfile2>(BF); };
    break;
  case SplitFunctionsStrategy::Random2:
    WorkFun = [&](BinaryFunction &BF) {
      splitFunction(BF, SplitRandom2(RandGen));
    };
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    WorkFun = [&](BinaryFunction &BF) {
      splitFunction(BF, SplitRandomN(RandGen));
    };
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitAll>(BF); };
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

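// Split a single function using the given strategy: decide which blocks may
// be outlined, let the strategy assign fragment numbers, and, on x86, revert
// the split if it does not shrink the hot fragment by more than
// -split-threshold bytes.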
template <typename Strategy>
void SplitFunctions::splitFunction(BinaryFunction &BF, Strategy S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;
    if (!S.canOutline(*BB)) {
      BB->setCanOutline(false);
      continue;
    }
    // Do not split extra entry points in AArch64. They can be referenced via
    // ADR instructions, and when this happens these blocks cannot be placed
    // far away due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move it in order to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
      return A->canOutline() < B->canOutline();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the
    // end. We cannot move the beginning of a landing pad, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->canOutline() < B->canOutline();
                     });
  }

  // Identify the last block that must not be split into a fragment. Every block
  // after this block can be split. Note that when the iterator points to the
  // block that cannot be outlined, then reverse_iterator::base() points to the
  // block after it.
  const BinaryFunction::BasicBlockOrderType::reverse_iterator FirstOutlineable =
      llvm::find_if(reverse(NewLayout), [](const BinaryBasicBlock *const BB) {
        return !BB->canOutline();
      });

  S.partition(FirstOutlineable.base(), NewLayout.end());
  BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && BF.isSplit()) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
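    // Revert the split if the aligned hot size did not shrink by more than
    // -split-threshold bytes relative to the pre-split size.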
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n  0x"
                        << Twine::utohexstr(HotSize) << ", 0x"
                        << Twine::utohexstr(ColdSize) << " -> 0x"
                        << Twine::utohexstr(OriginalHotSize) << '\n');

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }
}

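// For each invoke whose landing pad ends up in a different fragment, create a
// trampoline block in the thrower's fragment that forwards control to the
// real landing pad, and point the invoke's EH info at the trampoline.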
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks since we are adding new blocks to
  // the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const Optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of their corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

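// Undo createEHTrampolines() when a split is reverted: rebuild the pre-split
// layout with each trampoline block placed immediately before its target
// landing pad, so that the trampolines add no extra code.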
SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm