//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
/// Custom command-line parser for -split-functions that accepts the legacy
/// numeric spellings "2" and "3" as "true", printing a deprecation warning
/// instead of rejecting the invocation. All other spellings are delegated to
/// the regular boolean parser.
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  /// Returns false on success (per the cl::parser convention) after mapping a
  /// deprecated numeric value to true; otherwise defers to cl::parser<bool>.
  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

// Options declared in other BOLT translation units that this pass reads.
extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

// When set, every outlineable zero-count block is moved to the cold fragment,
// even blocks that sit between hot blocks in the current layout.
static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

// Alignment applied to the pre-/post-split hot sizes before comparing them in
// splitFunction(); see the -split-threshold check there.
static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),

    cl::Hidden, cl::cat(BoltOptCategory));

// Master switch for the pass; uses the deprecated-value-tolerant parser above.
static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

// Minimum number of bytes the hot (main) fragment must shrink by for a split
// to be kept; otherwise the split is reversed in splitFunction().
static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

// Selects which of the strategy structs below drives block partitioning.
static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at a randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));
} // namespace opts
namespace {
/// Profile-guided strategy: outline exactly the blocks with a zero execution
/// count into the single cold fragment.
struct SplitProfile2 {
  /// A function is splittable iff it has a valid profile, every block has a
  /// recorded count (no COUNT_NO_PROFILE), and at least one block is hot.
  bool canSplit(const BinaryFunction &BF) {
    if (!BF.hasValidProfile())
      return false;

    bool AllCold = true;
    for (const BinaryBasicBlock &BB : BF) {
      const uint64_t ExecCount = BB.getExecutionCount();
      if (ExecCount == BinaryBasicBlock::COUNT_NO_PROFILE)
        return false;
      if (ExecCount != 0)
        AllCold = false;
    }

    return !AllCold;
  }

  /// Only never-executed blocks are candidates for outlining.
  bool canOutline(const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  }

  /// Move every block in [Start, End) to the cold fragment.
  template <typename It> void partition(const It Start, const It End) const {
    std::for_each(Start, End, [](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum::cold());
    });
  }
};

/// Randomized two-fragment strategy: picks a random suffix of the outlineable
/// blocks and moves it to the cold fragment, ignoring profile data.
struct SplitRandom2 {
  // Non-owning pointer to the pass-wide RNG; runOnFunctions() forces
  // sequential execution so the draw order is deterministic across runs.
  std::minstd_rand0 *Gen;

  explicit SplitRandom2(std::minstd_rand0 &Gen) : Gen(&Gen) {}

  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    using DiffT = typename std::iterator_traits<It>::difference_type;
    const DiffT NumOutlineableBlocks = End - Start;

    // We want to split at least one block unless there are no blocks that can
    // be outlined
    const auto MinimumSplit = std::min<DiffT>(NumOutlineableBlocks, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplit,
                                              NumOutlineableBlocks);
    const DiffT NumColdBlocks = Dist(*Gen);
    std::for_each(End - NumColdBlocks, End, [](BinaryBasicBlock *BB) {
      BB->setFragmentNum(FragmentNum::cold());
    });

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumColdBlocks, End - Start));
  }
};

/// Randomized N-fragment strategy: draws a random number of split points at
/// random positions and assigns each resulting run of blocks to consecutive
/// fragments, ignoring profile data.
struct SplitRandomN {
  // Non-owning pointer to the pass-wide RNG (see SplitRandom2::Gen).
  std::minstd_rand0 *Gen;

  explicit SplitRandomN(std::minstd_rand0 &Gen) : Gen(&Gen) {}

  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    using DiffT = typename std::iterator_traits<It>::difference_type;
    const DiffT NumOutlineableBlocks = End - Start;

    // We want to split at least one fragment if possible
    const auto MinimumSplits = std::min<DiffT>(NumOutlineableBlocks, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits,
                                              NumOutlineableBlocks);
    // Choose how many splits to perform
    const DiffT NumSplits = Dist(*Gen);

    // Draw split points from a lottery: shuffle all block positions, keep the
    // first NumSplits of them, and sort so they can be walked in order below.
    SmallVector<unsigned, 0> Lottery(NumOutlineableBlocks);
    std::iota(Lottery.begin(), Lottery.end(), 0u);
    std::shuffle(Lottery.begin(), Lottery.end(), *Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add one past the end entry to lottery, so LotteryIndex never runs off
    // the end of the vector in the loop below.
    Lottery.push_back(NumOutlineableBlocks);

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start new fragment
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

/// Exhaustive strategy: every outlineable block gets its own fragment
/// (fragment numbers 1, 2, 3, ...).
struct SplitAll {
  bool canSplit(const BinaryFunction &BF) { return true; }
  bool canOutline(const BinaryBasicBlock &BB) { return true; }

  template <typename It> void partition(It Start, It End) const {
    unsigned Fragment = 1;
    std::for_each(Start, End, [&](BinaryBasicBlock *const BB) {
      assert(BB->canOutline() &&
             "Moving a block that is not outlineable to cold fragment");
      BB->setFragmentNum(FragmentNum(Fragment++));
    });
  }
};
} // namespace

namespace llvm {
namespace bolt {

/// Gate the pass per function: skip functions whose known execution count is
/// below -execution-count-threshold, then apply the base-class policy.
bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

/// Pass entry point: selects the strategy from -split-strategy and runs
/// splitFunction() over all eligible functions, then reports split statistics.
void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  // Seeded RNG shared by the randomized strategies; the fixed seed plus
  // sequential scheduling (below) makes the random splits reproducible.
  std::minstd_rand0 RandGen(opts::RandomSeed.getValue());

  ParallelUtilities::WorkFuncTy WorkFun;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::Profile2:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitProfile2>(BF); };
    break;
  case SplitFunctionsStrategy::Random2:
    WorkFun = [&](BinaryFunction &BF) {
      splitFunction(BF, SplitRandom2(RandGen));
    };
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    WorkFun = [&](BinaryFunction &BF) {
      splitFunction(BF, SplitRandomN(RandGen));
    };
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    WorkFun = [&](BinaryFunction &BF) { splitFunction<SplitAll>(BF); };
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR, WorkFun, SkipFunc,
      "SplitFunctions", ForceSequential);

  // SplitBytesHot/SplitBytesCold are accumulated in splitFunction() for
  // functions whose split was kept (x86 only; see the size check there).
  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

/// Split a single function using strategy S:
///  1. mark blocks that must stay in the main fragment (entry, landing pads,
///     throwing blocks, AArch64 extra entry points),
///  2. optionally re-sort the layout to push outlineable blocks to the end,
///  3. let the strategy assign fragment numbers to the outlineable suffix,
///  4. create EH trampolines if invoke/landing-pad pairs got separated,
///  5. on x86, undo the whole split if it does not shrink the hot fragment by
///     more than -split-threshold bytes (after -split-align-threshold
///     alignment).
template <typename Strategy>
void SplitFunctions::splitFunction(BinaryFunction &BF, Strategy S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  // Keep a copy of the original layout so the split can be reversed.
  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  // Sizes are only computed and compared on x86 (both reads below are guarded
  // by BC.isX86(), so these stay uninitialized elsewhere by design).
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;
    if (!S.canOutline(*BB)) {
      BB->setCanOutline(false);
      continue;
    }
    // Do not split extra entry points in aarch64. They can be referred by
    // using ADRs and when this happens, these blocks cannot be placed far
    // away due to the limited range in ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function.
    // Even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
      return A->canOutline() < B->canOutline();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the end.
    // We cannot move beginning of landing pads, but we can move 0-count blocks
    // comprising landing pads to the end and thus facilitate splitting.
    // NOTE(review): this scan has no end-of-layout bound; it appears to rely
    // on the entry block (never a landing pad candidate here) terminating the
    // loop — confirm a function cannot consist solely of landing-pad blocks.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->canOutline() < B->canOutline();
                     });
  }

  // Identify the last block that must not be split into a fragment. Every block
  // after this block can be split. Note that when the iterator points to the
  // block that cannot be outlined, then reverse_iterator::base() points to the
  // block after it.
  const BinaryFunction::BasicBlockOrderType::reverse_iterator FirstOutlineable =
      llvm::find_if(reverse(NewLayout), [](const BinaryBasicBlock *const BB) {
        return !BB->canOutline();
      });

  S.partition(FirstOutlineable.base(), NewLayout.end());
  BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && BF.isSplit()) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      LLVM_DEBUG(dbgs() << "Reversing splitting of function " << BF << ":\n 0x"
                        << Twine::utohexstr(HotSize) << ", 0x"
                        << Twine::utohexstr(ColdSize) << " -> 0x"
                        << Twine::utohexstr(OriginalHotSize) << '\n');

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination resulting in no
      // extra code. A size difference means trampoline blocks were added.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }
}

/// For every invoke whose landing pad ended up in a different fragment, create
/// (or reuse) a trampoline block in the thrower's fragment that falls through
/// to the real landing pad, and retarget the invoke's EH info at it. Returns
/// the map from (fragment, landing-pad label) to trampoline label so the
/// operation can be reversed by mergeEHTrampolines().
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over the copy of basic blocks since we are adding new blocks to the
  // function which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const Optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      // Same fragment: the invoke can still reach its landing pad directly.
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      // One trampoline per (thrower fragment, landing pad) pair is enough;
      // reuse it for every invoke in that fragment targeting the same LP.
      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

/// Rebuild a pre-split layout that also contains the trampoline blocks added
/// by createEHTrampolines(): each trampoline is placed immediately before its
/// target landing pad so that, once the split is reversed, it emits no extra
/// branch. Returns the merged layout.
SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  // Invert the trampoline map: landing-pad label -> its trampoline labels.
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  // Walk the original layout, inserting each block's incoming trampolines
  // right before it.
  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm