//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
/// Custom parser for -split-functions that accepts the legacy numeric values
/// "2" and "3" (treated as `true`) with a deprecation warning. All other
/// spellings are handled by the stock boolean parser.
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  // Returns false on success, per the cl::parser convention.
  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

// Options declared in other BOLT translation units.
extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),

    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));
} // namespace opts

namespace {
// The three strategy structs below share a duck-typed interface consumed by
// SplitFunctions::splitFunction<Strategy>:
//   canSplit(BF)   - whole-function gate: is this function splittable at all?
//   canOutline(BB) - per-block gate: may this block leave the main fragment?
//   partition(Start, End) - assign fragment numbers to the outlineable tail
//                           of the new layout.

/// Profile-driven 2-way splitting: only functions with a fully valid profile
/// and at least one executed block are split, and only never-executed blocks
/// are moved to the cold fragment.
struct SplitProfile2 {
  bool canSplit(const BinaryFunction &BF) {
    if (!BF.hasValidProfile())
      return false;

    // Reject functions with any unprofiled block; remember whether at least
    // one block was actually executed.
    bool AllCold = true;
    for (const BinaryBasicBlock &BB : BF) {
      const uint64_t ExecCount = BB.getExecutionCount();
      if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
        return false;
      if (ExecCount != 0)
        AllCold = false;
    }

    // Splitting an entirely cold function would leave an empty hot fragment.
    return !AllCold;
  }

  bool canOutline(const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  }

  // Move every block in [Start, End) to the single cold fragment.
  template <typename It> void partition(const It Start, const It End) const {
    std::for_each(Start, End, [](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum::cold());
    });
  }
};

/// Random 2-way splitting (for testing): picks a random suffix of the layout
/// and moves it to the cold fragment, ignoring profile data.
struct SplitRandom2 {
  // Non-owning pointer to the shared generator so repeated runs with the same
  // -bolt-seed reproduce the same splits.
  std::minstd_rand0 *Gen;

  explicit SplitRandom2(std::minstd_rand0 &Gen) : Gen(&Gen) {}

  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    using DiffT = typename std::iterator_traits<It>::difference_type;
    const DiffT NumOutlineableBlocks = End - Start;

    // We want to split at least one block unless there are no blocks that can
    // be outlined.
    const auto MinimumSplit = std::min<DiffT>(NumOutlineableBlocks, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplit,
                                              NumOutlineableBlocks);
    const DiffT NumColdBlocks = Dist(*Gen);
    std::for_each(End - NumColdBlocks, End, [](BinaryBasicBlock *BB) {
      BB->setFragmentNum(FragmentNum::cold());
    });

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumColdBlocks, End - Start));
  }
};

/// Maximal splitting (for testing): every outlineable block becomes its own
/// fragment.
struct SplitAll {
  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    // Fragment 0 is the main fragment; number the rest 1, 2, 3, ...
    unsigned Fragment = 1;
    std::for_each(Start, End, [&](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum(Fragment++));
    });
  }
};
} // namespace

namespace llvm {
namespace bolt {
bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

// Pass entry point: pick the strategy from -split-strategy and run it over
// every eligible function, in parallel where the strategy allows it.
void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  // Seeded generator shared by all functions under the random2 strategy.
  std::minstd_rand0 RandGen(opts::RandomSeed.getValue());

  ParallelUtilities::WorkFuncTy WorkFun;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::Profile2:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitProfile2>(BF); };
    break;
  case SplitFunctionsStrategy::Random2:
    WorkFun = [&](BinaryFunction &BF) {
      splitFunction(BF, SplitRandom2(RandGen));
    };
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitAll>(BF); };
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "SplitFunctions", ForceSequential);

  // Byte counters are accumulated by splitFunction across all functions.
  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

// Split a single function using strategy S:
//  1. mark blocks that must stay in the main fragment (canOutline = false),
//  2. optionally reorder outlineable blocks to the end of the layout,
//  3. let S assign fragment numbers to the outlineable suffix,
//  4. on x86, compare emitted sizes before/after and undo the split if the
//     main fragment did not shrink enough (see -split-threshold).
template <typename Strategy>
void SplitFunctions::splitFunction(BinaryFunction &BF, Strategy S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  // Saved so the split can be reverted if it turns out not to be profitable.
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  // Size estimation (and the rollback below) is only done on x86.
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;
    if (!S.canOutline(*BB)) {
      BB->setCanOutline(false);
      continue;
    }
    // Do not split extra entry points in aarch64. They can be referred by
    // using ADRs and when this happens, these blocks cannot be placed far
    // away due to the limited range in ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function.
    // Even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    // Stable sort on the boolean keeps relative order within each group.
    stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
      return A->canOutline() < B->canOutline();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the end.
    // We cannot move beginning of landing pads, but we can move 0-count blocks
    // comprising landing pads to the end and thus facilitate splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->canOutline() < B->canOutline();
                     });
  }

  // Identify the last block that must not be split into a fragment. Every block
  // after this block can be split. Note that when the iterator points to the
  // block that cannot be outlined, then reverse_iterator::base() points to the
  // block after it.
  const BinaryFunction::BasicBlockOrderType::reverse_iterator FirstOutlineable =
      llvm::find_if(reverse(NewLayout), [](const BinaryBasicBlock *const BB) {
        return !BB->canOutline();
      });

  S.partition(FirstOutlineable.base(), NewLayout.end());
  BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && BF.isSplit()) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      // Not profitable: restore the pre-split layout.
      LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n 0x"
                        << Twine::utohexstr(HotSize) << ", 0x"
                        << Twine::utohexstr(ColdSize) << " -> 0x"
                        << Twine::utohexstr(OriginalHotSize) << '\n');

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination resulting in no
      // extra code.
      // A size mismatch means trampoline blocks were added to the function.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }
}

// For every invoke whose landing pad ends up in a different fragment, create
// (or reuse) a trampoline block in the thrower's fragment that jumps to the
// real landing pad, and point the invoke's EH info at the trampoline.
// Returns the mapping from real landing-pad labels to trampoline labels.
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over the copy of basic blocks since we are adding new blocks to the
  // function which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const Optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      // Same fragment (both hot or both cold): no trampoline needed.
      if (BB->isCold() == LPBlock->isCold())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      auto Iter = LPTrampolines.find(LPLabel);
      if (Iter != LPTrampolines.end()) {
        // Reuse the trampoline already created for this landing pad.
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setIsCold(BB->isCold());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(LPLabel, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->isCold() < B->isCold();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

// Build a layout that re-inserts each trampoline block immediately before its
// real landing pad in the pre-split layout, so that after the split is
// reversed the trampolines fall through into their targets at no code cost.
SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    // If BB is a landing pad that got a trampoline, place the trampoline
    // block right before it.
    auto Iter = Trampolines.find(BB->getLabel());
    if (Iter != Trampolines.end()) {
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Iter->second);
      assert(LPBlock && "Could not find matching landing pad block.");
      MergedLayout.push_back(LPBlock);
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm