//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2),
    cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split a function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly chosen "
        "split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that each "
        "fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

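// Parameters of the CDSplit scoring model (see computeCallScore and
// computeJumpScore below): a call edge contributes
//   CallScale * Count / (Length + 1) ^ CallPower
// and a jump edge contributes
//   Count / (Length + 1) ^ JumpPower
// to the split score, so hotter and shorter edges are preferred.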
static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

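// Strategy used by -split-strategy=cdsplit after function reordering:
// pick the hot-warm split point that maximizes a distance-based score over
// call and jump edges (see computeSplitScore). Blocks past the split point
// are placed in the warm or cold fragment depending on their execution count.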
struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();
    // No need to re-split small functions.
    if (BlockOrder.size() <= 2)
      return;

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
    assert(BestSplitIndex < BlockOrder.size());

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold, depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex = size_t(-1);
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;

    double sum() const { return LocalScore + CoverCallScore; }
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
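        // Split (cold) blocks do not advance OrigHotSectionSize, so they are
        // all assigned the offset of the current end of the hot section.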
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls without target information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the functions considered
          // for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall-through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses and compute the old end address of the hot section and the
    // new end address of the hot section.
    size_t OldHotEndAddr{0};
    size_t NewHotEndAddr{0};
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
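      // Cold blocks are laid out after all hot and warm blocks, so we can
      // stop at the first one.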
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

  /// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
  /// Increment Score.LocalScore in place by the sum.
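  /// Jump lengths are measured between the estimated post-split end address
  /// of the source block and the start address of the destination block.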
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase or
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB is never executed, its calls do not contribute to the score.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute sum of splitting scores for cover calls of the input function.
  /// Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the reduction in hot fragment size.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of local score and cover score. This function
  /// returns \p Score of SplitScore type. It contains the local score and
  /// cover score of the current splitting index. For easier bookkeeping and
  /// comparison, it also stores the split index and the resulting reduction
  /// in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // First part of LocalScore is the sum over call edges originating in the
    // input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // Second part of LocalScore is the sum over jump edges with both src and
    // dst basic blocks in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the most likely successor of a basic block when it has one or two
  /// successors. Return nullptr otherwise.
  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
    if (BB->succ_size() == 1)
      return BB->getSuccessor();
    if (BB->succ_size() == 2) {
      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
      if (TakenCount > NonTakenCount)
        return BB->getConditionalSuccessor(true);
      else if (TakenCount < NonTakenCount)
        return BB->getConditionalSuccessor(false);
    }
    return nullptr;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning
  /// the index of the last basic block, i.e., one less than the number of
  /// blocks in the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    assert(BlockOrder.size() > 2);
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Find the existing hot-cold splitting index.
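    // After the loop, HotColdIndex is the index of the last block of the hot
    // fragment produced by the earlier hot-cold split (or of the last block
    // in the function if nothing was split).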
    size_t HotColdIndex = 0;
    while (HotColdIndex + 1 < BlockOrder.size()) {
      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
        break;
      HotColdIndex++;
    }
    assert(HotColdIndex + 1 == BlockOrder.size() ||
           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
                FragmentNum::cold()));

    // Try all possible split indices up to HotColdIndex (blocks that have
    // Index <= SplitIndex are in hot) and find the one maximizing the
    // splitting score.
    SplitScore BestScore;
    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

      // Do not break the jump to the most likely successor.
      if (Index + 1 < BlockOrder.size() &&
          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
        continue;

      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
      if (Score.sum() > BestScore.sum())
        BestScore = Score;
    }

    // If we don't find a good splitting point, fall back to the original one.
    if (BestScore.SplitIndex == size_t(-1))
      return HotColdIndex;

    return BestScore.SplitIndex;
  }
};

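// Strategy used by -split-strategy=random2: split each function into a hot
// and a cold fragment at a single randomly chosen point, ignoring any
// available profiling information.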
struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

Error SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return Error::success();

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return Error::success();

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    BC.outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
              << " hot bytes from " << SplitBytesCold << " cold bytes "
              << format("(%.2lf%% of split functions is hot).\n",
                        100.0 * SplitBytesHot /
                            (SplitBytesHot + SplitBytesCold));
  return Error::success();
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and in that case the blocks cannot be placed far away
    // due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in between
    // hot basic blocks.
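    // A stable sort keeps the original relative order of the blocks within
    // each fragment.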
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the
    // end. We cannot move the beginning of landing pads, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragment numbers are non-decreasing along the layout.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to the real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        BC.outs() << "BOLT-INFO: Reversing splitting of function "
                  << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                             ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
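  // The map is keyed by (fragment number, landing-pad label), so each
  // fragment gets at most one trampoline per landing pad.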
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks since we are adding new blocks to
  // the function, which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of the corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm