//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split function only if its main size is reduced by more than "
             "given amount of bytes. Default value: 0, i.e. split iff the "
             "size is reduced. Note that on some architectures the size can "
             "increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly "
        "chosen split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that "
        "each fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

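// Illustrative use of these options (binary and file names are placeholders):
// splitting is enabled with --split-functions and the block partitioning
// strategy is selected with --split-strategy, e.g.
//   llvm-bolt app -o app.bolt --split-functions --split-strategy=cdsplit
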
static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

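/// Cache-directed splitting (CDSplit): partition each function into hot, warm,
/// and cold fragments. The hot/warm boundary is chosen by findSplitIndex() to
/// maximize an estimated locality score over call and jump edges; zero-count
/// blocks past that boundary become cold, the remaining ones become warm.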
struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold, depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct JumpInfo {
    bool HasUncondBranch = false;
    BinaryBasicBlock *CondSuccessor = nullptr;
    BinaryBasicBlock *UncondSuccessor = nullptr;
  };

  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex;
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;
  DenseMap<const BinaryBasicBlock *, JumpInfo> JumpInfos;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    // Gather information about conditional and unconditional successors of
    // each basic block; this information will be used to estimate block size
    // increase due to hot-warm splitting.
    auto analyzeBranches = [&](BinaryBasicBlock &BB) {
      JumpInfo BBJumpInfo;
      const MCSymbol *TBB = nullptr;
      const MCSymbol *FBB = nullptr;
      MCInst *CondBranch = nullptr;
      MCInst *UncondBranch = nullptr;
      if (BB.analyzeBranch(TBB, FBB, CondBranch, UncondBranch)) {
        BBJumpInfo.HasUncondBranch = UncondBranch != nullptr;
        if (BB.succ_size() == 1) {
          BBJumpInfo.UncondSuccessor = BB.getSuccessor();
        } else if (BB.succ_size() == 2) {
          BBJumpInfo.CondSuccessor = BB.getConditionalSuccessor(true);
          BBJumpInfo.UncondSuccessor = BB.getConditionalSuccessor(false);
        }
      }
      return BBJumpInfo;
    };

    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];

        // (Un)Conditional branch instruction information.
        JumpInfos[BB] = analyzeBranches(*BB);
      }
    }
  }

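  /// Build a lightweight call graph over the basic blocks collected above:
  /// Callers[GlobalIndices[Entry]] holds the blocks that call the function
  /// whose entry block is Entry, and Callees[GlobalIndices[BB]] holds the
  /// entry blocks of the functions called from BB. Only executed blocks and
  /// calls with a known target are recorded.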
  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls w/o information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the functions considered
          // for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return estimated end addresses
  /// of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update the function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses, and compute the old and the new end address of the hot
    // section.
    size_t OldHotEndAddr;
    size_t NewHotEndAddr;
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
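  /// For example, with a layout [..., X, ..., BF, ..., Y, ...] and a call from
  /// X to Y, moving R bytes of BF out of the hot section brings X and Y closer
  /// by exactly R bytes (see computeCoverCallScore()).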
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

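  // Both edge scores below follow the same shape:
  //   call score = CallScale * CallCount / (CallLength + 1) ^ CallPower
  //   jump score =             JumpCount / (JumpLength + 1) ^ JumpPower
  // so hotter and shorter edges contribute more to the split score.
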
  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute the sum of scores over calls originated in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originated in the input function can increase /
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute the sum of splitting scores for cover calls of the input
  /// function. Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the original
      // one by the size of the split fragment minus the total size increase.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of a local score and a cover score. The cover
  /// call score is expensive to compute. As a result, we pass in a
  /// \p ReferenceScore and compute the cover score only when the local score
  /// exceeds that of the ReferenceScore, or when the size reduction of the hot
  /// fragment is larger than that achieved by the split index of the
  /// ReferenceScore. This function returns \p Score of SplitScore type.
  /// It contains the local score and the cover score (if computed) of the
  /// current splitting index. For easier bookkeeping and comparison, it also
  /// stores the split index and the resulting reduction in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls,
                               const SplitScore &ReferenceScore) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // The first part of LocalScore is the sum over call edges originated in
    // the input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // The second part of LocalScore is the sum over jump edges with both src
    // and dst basic blocks in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // There is no need to compute CoverCallScore if we have already found
    // another split index with a bigger LocalScore and bigger HotSizeReduction.
    if (Score.LocalScore <= ReferenceScore.LocalScore &&
        Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
      return Score;

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning the
  /// value which is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Try all possible split indices (blocks with Index <= SplitIndex are in
    // hot) and find the one maximizing the splitting score.
    SplitScore BestScore;
    double BestScoreSum = -1.0;
    SplitScore ReferenceScore;
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      // No need to keep cold blocks in the hot section.
      if (LastHotBB->getFragmentNum() == FragmentNum::cold())
        break;
      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
      double ScoreSum = Score.LocalScore + Score.CoverCallScore;
      if (ScoreSum > BestScoreSum) {
        BestScoreSum = ScoreSum;
        BestScore = Score;
      }
      if (Score.LocalScore > ReferenceScore.LocalScore)
        ReferenceScore = Score;
    }

    return BestScore.SplitIndex;
  }
};

struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

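    // For example, with NumBlocks = 6 and NumSplits = 2, a sorted lottery of
    // {2, 4} becomes {2, 4, 6}: blocks 0-1 stay in fragment 0, blocks 2-3 go
    // to fragment 1, and blocks 4-5 go to fragment 2.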
    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0 based and cold fragments are 1 based, we can
      // use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply the execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return;

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the same
    // order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and when that happens, these blocks cannot be placed
    // far away due to the limited range of ADR.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw, since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with 0 count that we can move go to the end of the function,
    // even if they were natural to cluster formation and were seen in-between
    // hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically functions with exception handling have landing pads at the
    // end. We cannot move the beginning of landing pads, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragments are increasing.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
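  // With the default -split-align-threshold=2 and -split-threshold=0, the
  // split is kept only if the aligned hot size strictly shrinks; e.g., going
  // from 100 to 99 bytes is reverted (both align to 100), while going from
  // 100 to 97 bytes (aligned to 98) is kept.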
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        outs() << "BOLT-INFO: Reversing splitting of function "
               << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                          ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination, resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

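/// For every block that can throw and whose landing pad lives in a different
/// fragment, create (or reuse) a trampoline block in the thrower's fragment
/// and redirect the instruction's EH info to it; fixBranches() later adds the
/// jump from the trampoline to the real landing pad.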
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks, since we are adding new blocks
  // to the function, which would invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm