//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. "
             "Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
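  // (Compacting would renumber the remaining fragments to be contiguous and,
  // for a function with no warm blocks, turn its cold fragment into warm; see
  // the compactFragments() handling in SplitFunctions::splitFunction below.)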
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold, depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange, which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
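        // Split (cold) blocks do not contribute to the running hot-section
        // size; they simply record the current offset as an approximation.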
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls without target information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the set of functions
          // considered for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses and compute the old end address of the hot section and the
    // new end address of the hot section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
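      // Cold blocks keep their original addresses, and the layout places them
      // last, so stop at the first cold block.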
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
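  /// Each jump edge contributes JumpCount / (JumpLength + 1)^opts::JumpPower
  /// (see computeJumpScore above), using the post-split block addresses
  /// estimated by estimatePostSplitBBAddress.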
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute the sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase or
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute the sum of splitting scores for cover calls of the input
  /// function. Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the size of the split fragment minus the total size increase.
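      // The assert above guarantees that this subtraction does not underflow.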
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of local score and cover score. This function
  /// returns a SplitScore object containing the local score and cover score
  /// of the current splitting index. For easier bookkeeping and comparison,
  /// it also stores the split index and the resulting reduction in hot
  /// fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // First part of LocalScore is the sum over call edges originating in the
    // input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // Second part of LocalScore is the sum over jump edges with src basic
    // block and dst basic block in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning the
  /// value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
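    // At this point the layout is [hot ..., cold ...] from the earlier
    // hot-cold split, so HotColdIndex ends up as the index of the last
    // main-fragment block.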
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks that have
    // Index <= SplitIndex are in hot) and find the one maximizing the
    // splitting score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break the jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};

struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
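    // (When NumBlocks == 1, MaximumSplits is 0 and the distribution below
    // degenerates to always drawing zero splits.)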
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply the execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return;

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
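    // Sequential processing fixes the order in which functions draw from the
    // shared RNG, making the chosen split points reproducible across runs.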
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and when this happens, these blocks cannot be placed
    // far away due to ADR's limited range.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw, since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in between
    // hot basic blocks.
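    // A stable sort keeps the original relative order of blocks within each
    // fragment.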
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically, functions with exception handling have landing pads at the
    // end. We cannot move the beginnings of landing pads, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragments are increasing.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        outs() << "BOLT-INFO: Reversing splitting of function "
               << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                          ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
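  // Keyed by (fragment of the throwing block, landing-pad label), so each
  // fragment gets at most one trampoline per landing pad.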
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks, since we are adding new blocks
  // to the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm