//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
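// Option parser that keeps the deprecated numeric values of -split-functions
// working: "2" and "3" are accepted as "true" with a deprecation warning,
// while any other value falls through to the stock boolean parser. Note that
// returning false from cl::parser<>::parse() signals success.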
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split a function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));
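
// Example invocation exercising these options (illustrative; the binary and
// profile file names are placeholders):
//   llvm-bolt a.out -o a.bolt -data=perf.fdata -split-functions \
//     -split-strategy=profile2 -split-align-threshold=2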
} // namespace opts

namespace {
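// Profile-guided strategy: only blocks with a zero execution count may be
// outlined, and splitting is skipped for functions that lack a full valid
// profile or whose blocks are all cold.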
struct SplitProfile2 {
  bool canSplit(const BinaryFunction &BF) {
    if (!BF.hasValidProfile())
      return false;

    bool AllCold = true;
    for (const BinaryBasicBlock &BB : BF) {
      const uint64_t ExecCount = BB.getExecutionCount();
      if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
        return false;
      if (ExecCount != 0)
        AllCold = false;
    }

    return !AllCold;
  }

  bool canOutline(const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  }

  template <typename It> void partition(const It Start, const It End) const {
    std::for_each(Start, End, [](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum::cold());
    });
  }
};

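// Profile-agnostic strategy that moves a randomly sized suffix of the
// outlineable blocks to the cold fragment; primarily useful for testing.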
struct SplitRandom2 {
  std::minstd_rand0 *Gen;

  explicit SplitRandom2(std::minstd_rand0 &Gen) : Gen(&Gen) {}

  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    using DiffT = typename std::iterator_traits<It>::difference_type;
    const DiffT NumOutlineableBlocks = End - Start;

    // We want to split at least one block unless there are no blocks that can
    // be outlined.
    const auto MinimumSplit = std::min<DiffT>(NumOutlineableBlocks, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplit,
                                              NumOutlineableBlocks);
    const DiffT NumColdBlocks = Dist(*Gen);
    std::for_each(End - NumColdBlocks, End, [](BinaryBasicBlock *BB) {
      BB->setFragmentNum(FragmentNum::cold());
    });

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumColdBlocks, End - Start));
  }
};

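// Strategy that places every outlineable block into its own fragment; the
// main fragment retains only the blocks that cannot be outlined.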
struct SplitAll {
  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    unsigned Fragment = 1;
    std::for_each(Start, End, [&](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum(Fragment++));
    });
  }
};
} // namespace

namespace llvm {
namespace bolt {

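// In addition to the generic pass criteria, skip functions whose known
// execution count is below the global threshold.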
bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

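// Pass entry point: select the splitting strategy requested on the command
// line and apply splitFunction() to every eligible function, in parallel
// unless the strategy requires deterministic sequential processing.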
void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  std::minstd_rand0 RandGen(opts::RandomSeed.getValue());

  ParallelUtilities::WorkFuncTy WorkFun;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::Profile2:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitProfile2>(BF); };
    break;
  case SplitFunctionsStrategy::Random2:
    WorkFun = [&](BinaryFunction &BF) {
      splitFunction(BF, SplitRandom2(RandGen));
    };
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitAll>(BF); };
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

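// Split a single function using strategy S: mark blocks that may never be
// outlined (the entry block, AArch64 extra entry points, EH-sensitive blocks),
// let the strategy assign fragment numbers to the outlineable suffix of the
// layout, and, on x86, undo the split if it does not shrink the main fragment
// by more than -split-threshold bytes.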
template <typename Strategy>
void SplitFunctions::splitFunction(BinaryFunction &BF, Strategy S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;
    if (!S.canOutline(*BB)) {
      BB->setCanOutline(false);
      continue;
    }
    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and when that happens, these blocks cannot be placed
    // far away due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw, since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move it to decrease the
      // size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in between
    // hot basic blocks.
    stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
      return A->canOutline() < B->canOutline();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically, functions with exception handling have landing pads at the
    // end. We cannot move the beginning of a landing pad, but we can move the
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->canOutline() < B->canOutline();
                     });
  }

  // Identify the last block that must not be split off into a fragment. Every
  // block after it can be split. Note that when a reverse iterator points to
  // the block that cannot be outlined, reverse_iterator::base() points to the
  // block after it.
  const BinaryFunction::BasicBlockOrderType::reverse_iterator FirstOutlineable =
      llvm::find_if(reverse(NewLayout), [](const BinaryBasicBlock *const BB) {
        return !BB->canOutline();
      });

  S.partition(FirstOutlineable.base(), NewLayout.end());
  BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and the corresponding landing pads
  // have to be placed in the same fragment. When we split them apart, create
  // trampoline landing pads that redirect execution to the real landing pads.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && BF.isSplit()) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n  0x"
                        << Twine::utohexstr(HotSize) << ", 0x"
                        << Twine::utohexstr(ColdSize) << " -> 0x"
                        << Twine::utohexstr(OriginalHotSize) << '\n');

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before their matching destinations, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }
}

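// For each invoke whose landing pad ended up in a different fragment, create a
// trampoline block in the thrower's fragment that branches to the real landing
// pad, and retarget the invoke's EH info to the trampoline. Returns a map from
// real landing-pad labels to trampoline labels.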
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks, since we are adding new blocks to
  // the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const Optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->isCold() == LPBlock->isCold())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      auto Iter = LPTrampolines.find(LPLabel);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there is no need to insert the jump instruction; it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setIsCold(BB->isCold());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(LPLabel, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of their corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->isCold() < B->isCold();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

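// Rebuild the pre-split layout with the EH trampolines folded in: each
// trampoline is placed directly before its real landing pad, so after
// fixBranches() it becomes a fall-through that adds no extra code.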
SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = Trampolines.find(BB->getLabel());
    if (Iter != Trampolines.end()) {
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Iter->second);
      assert(LPBlock && "Could not find matching landing pad block.");
      MergedLayout.push_back(LPBlock);
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm