//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));
static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "the given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
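  // Returning false here keeps the fragment numbers exactly as assigned by
  // fragment(), instead of renumbering them to be contiguous.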
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold, depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
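        // Every block records the current hot-section size as its offset, but
        // only non-split (hot) blocks advance the running total, so cold
        // blocks do not contribute to OrigHotSectionSize.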
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls without target information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the set of functions
          // considered for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall-through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses and compute the old end address of the hot section and the
    // new end address of the hot section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
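      // Blocks already in the cold fragment keep their addresses; they are
      // laid out separately and are unaffected by the hot-warm split point.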
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

  /// Compute sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
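  /// Uses the post-split address estimates previously stored in each block's
  /// OutputAddressRange by estimatePostSplitBBAddress().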
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase or
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute sum of splitting scores for cover calls of the input function.
  /// Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the net reduction in hot fragment size (the split-out bytes
      // minus any size increase caused by splitting).
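      // HotSizeReduction already accounts for extra branch instructions added
      // by the split, and the assertion above guarantees it does not exceed
      // the original call length.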
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of local score and cover score. This function
  /// returns \p Score of SplitScore type. It contains the local score and
  /// cover score of the current splitting index. For easier bookkeeping and
  /// comparison, it also stores the split index and the resulting reduction
  /// in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // First part of LocalScore is the sum over call edges originating in the
    // input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // Second part of LocalScore is the sum over jump edges with src basic
    // block and dst basic block in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning the
  /// value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
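    // HotColdIndex ends up as the index of the last block currently placed in
    // the hot (main) fragment; any blocks after it are already cold.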
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks that have
    // Index <= SplitIndex are in hot) and find the one maximizing the
    // splitting score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break a jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};

struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
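    // MinimumSplits is therefore 1 whenever splitting is possible and 0 for a
    // single-block function, where MaximumSplits is 0.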
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start lottery at 1, because there is no meaningful split point before
    // the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add one past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return Error::success();

  if (BC.IsLinuxKernel && BC.BOLTReserved.empty()) {
    BC.errs() << "BOLT-ERROR: split functions require reserved space in the "
                 "Linux kernel binary\n";
    exit(1);
  }

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return Error::success();

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
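    // Processing functions in parallel would make the order of RNG draws
    // nondeterministic, so force sequential execution for this strategy.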
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
              << " hot bytes from " << SplitBytesCold << " cold bytes "
              << format("(%.2lf%% of split functions is hot).\n",
                        100.0 * SplitBytesHot /
                            (SplitBytesHot + SplitBytesCold));
  return Error::success();
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and in that case the blocks cannot be placed far away
    // due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }

    // Outlining blocks with dynamic branches is not supported yet.
    if (BC.IsLinuxKernel) {
      if (llvm::any_of(
              *BB, [&](MCInst &Inst) { return BC.MIB->isDynamicBranch(Inst); }))
        BB->setCanOutline(false);
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with zero count that we can move are placed at the end of the
    // function,
    // even if they were natural to cluster formation and were seen in between
    // hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically, functions with exception handling have landing pads at the
    // end. We cannot move the beginning of a landing pad, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragments are increasing.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (BF.hasEHRanges() && BF.isSplit()) {
    // If all landing pads for this fragment are grouped in one (potentially
    // different) fragment, we can set LPStart to the start of that fragment
    // and avoid trampoline code.
    bool NeedsTrampolines = false;
    for (FunctionFragment &FF : BF.getLayout().fragments()) {
      // Vector of fragments that contain landing pads for this fragment.
      SmallVector<FragmentNum, 4> LandingPadFragments;
      for (const BinaryBasicBlock *BB : FF)
        for (const BinaryBasicBlock *LPB : BB->landing_pads())
          LandingPadFragments.push_back(LPB->getFragmentNum());

      // Eliminate duplicate entries from the vector.
      llvm::sort(LandingPadFragments);
      auto Last = llvm::unique(LandingPadFragments);
      LandingPadFragments.erase(Last, LandingPadFragments.end());

      if (LandingPadFragments.size() == 0) {
        // If the fragment has no landing pads, we can safely set the fragment
        // itself as its landing-pad fragment.
        BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
      } else if (LandingPadFragments.size() == 1) {
        BF.setLPFragment(FF.getFragmentNum(), LandingPadFragments.front());
      } else {
        if (!BC.HasFixedLoadAddress) {
          NeedsTrampolines = true;
          break;
        } else {
          BF.setLPFragment(FF.getFragmentNum(), std::nullopt);
        }
      }
    }

    // Trampolines guarantee that all landing pads for any given fragment will
    // be contained in the same fragment.
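    // After inserting trampolines, every fragment can use itself as its
    // landing-pad fragment, since throwers only target trampolines placed in
    // their own fragment.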
    if (NeedsTrampolines) {
      for (FunctionFragment &FF : BF.getLayout().fragments())
        BF.setLPFragment(FF.getFragmentNum(), FF.getFragmentNum());
      Trampolines = createEHTrampolines(BF);
    }
  }

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        BC.outs() << "BOLT-INFO: Reversing splitting of function "
                  << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                             ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Restore the LP fragment for the main fragment if the splitting was undone.
  if (BF.hasEHRanges() && !BF.isSplit())
    BF.setLPFragment(FragmentNum::main(), FragmentNum::main());

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks since we are adding new blocks to
  // the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
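        // The trampoline inherits the landing pad's execution count and CFI
        // state so the CFG profile and unwind information stay consistent.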
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm