//===- bolt/Passes/SplitFunctions.cpp - Pass for splitting function code --===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the SplitFunctions pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/SplitFunctions.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "bolt/Core/FunctionLayout.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FormatVariadic.h"
#include <algorithm>
#include <iterator>
#include <memory>
#include <numeric>
#include <random>
#include <vector>

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace {
class DeprecatedSplitFunctionOptionParser : public cl::parser<bool> {
public:
  explicit DeprecatedSplitFunctionOptionParser(cl::Option &O)
      : cl::parser<bool>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, bool &Value) {
    if (Arg == "2" || Arg == "3") {
      Value = true;
      errs() << formatv("BOLT-WARNING: specifying non-boolean value \"{0}\" "
                        "for option -{1} is deprecated\n",
                        Arg, ArgName);
      return false;
    }
    return cl::parser<bool>::parse(O, ArgName, Arg, Value);
  }
};
} // namespace

namespace opts {

extern cl::OptionCategory BoltOptCategory;

extern cl::opt<bool> SplitEH;
extern cl::opt<unsigned> ExecutionCountThreshold;
extern cl::opt<uint32_t> RandomSeed;

static cl::opt<bool> AggressiveSplitting(
    "split-all-cold", cl::desc("outline as many cold basic blocks as possible"),
    cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitAlignThreshold(
    "split-align-threshold",
    cl::desc("when deciding to split a function, apply this alignment "
             "while doing the size comparison (see -split-threshold). "
             "Default value: 2."),
    cl::init(2), cl::Hidden, cl::cat(BoltOptCategory));

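// Illustrative invocation that enables this pass and selects a splitting
// strategy (binary and profile file names below are placeholders):
//   llvm-bolt app -o app.bolt -data=perf.fdata --split-functions \
//     --split-strategy=cdsplit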
static cl::opt<bool, false, DeprecatedSplitFunctionOptionParser>
    SplitFunctions("split-functions",
                   cl::desc("split functions into fragments"),
                   cl::cat(BoltOptCategory));

static cl::opt<unsigned> SplitThreshold(
    "split-threshold",
    cl::desc("split a function only if its main size is reduced by more than "
             "the given number of bytes. Default value: 0, i.e., split iff "
             "the size is reduced. Note that on some architectures the size "
             "can increase after splitting."),
    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<SplitFunctionsStrategy> SplitStrategy(
    "split-strategy", cl::init(SplitFunctionsStrategy::Profile2),
    cl::values(clEnumValN(SplitFunctionsStrategy::Profile2, "profile2",
                          "split each function into a hot and cold fragment "
                          "using profiling information")),
    cl::values(clEnumValN(SplitFunctionsStrategy::CDSplit, "cdsplit",
                          "split each function into a hot, warm, and cold "
                          "fragment using profiling information")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::Random2, "random2",
        "split each function into a hot and cold fragment at a randomly "
        "chosen split point (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::RandomN, "randomN",
        "split each function into N fragments at randomly chosen split "
        "points (ignoring any available profiling information)")),
    cl::values(clEnumValN(
        SplitFunctionsStrategy::All, "all",
        "split all basic blocks of each function into fragments such that "
        "each fragment contains exactly a single basic block")),
    cl::desc("strategy used to partition blocks into fragments"),
    cl::cat(BoltOptCategory));

static cl::opt<double> CallScale(
    "call-scale",
    cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
    cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    CallPower("call-power",
              cl::desc("Call score power (when --split-strategy=cdsplit)"),
              cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));

static cl::opt<double>
    JumpPower("jump-power",
              cl::desc("Jump score power (when --split-strategy=cdsplit)"),
              cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
} // namespace opts

namespace {
bool hasFullProfile(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  });
}

bool allBlocksCold(const BinaryFunction &BF) {
  return llvm::all_of(BF.blocks(), [](const BinaryBasicBlock &BB) {
    return BB.getExecutionCount() == 0;
  });
}

struct SplitProfile2 final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End)) {
      if (BB->getExecutionCount() == 0)
        BB->setFragmentNum(FragmentNum::cold());
    }
  }
};

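/// Strategy used by -split-strategy=cdsplit once the function order has been
/// finalized. It scores every candidate hot/warm split point using the call
/// and jump distances estimated below, keeps the best-scoring prefix of the
/// layout hot, and places the remaining blocks in the warm or cold fragment
/// depending on whether they were executed.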
struct SplitCacheDirected final : public SplitStrategy {
  BinaryContext &BC;
  using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

  bool canSplit(const BinaryFunction &BF) override {
    return BF.hasValidProfile() && hasFullProfile(BF) && !allBlocksCold(BF);
  }

  explicit SplitCacheDirected(BinaryContext &BC) : BC(BC) {
    initializeAuxiliaryVariables();
    buildCallGraph();
  }

  // When some functions are hot-warm split and others are hot-warm-cold split,
  // we do not want to change the fragment numbers of the blocks in the
  // hot-warm split functions.
  bool compactFragments() override { return false; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    BasicBlockOrder BlockOrder(Start, End);
    BinaryFunction &BF = *BlockOrder.front()->getFunction();

    size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);

    // Assign fragments based on the computed best split index.
    // All basic blocks with index up to the best split index become hot.
    // All remaining blocks are warm or cold depending on whether their
    // execution count is greater than zero.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (Index <= BestSplitIndex)
        BB->setFragmentNum(FragmentNum::main());
      else
        BB->setFragmentNum(BB->getKnownExecutionCount() > 0
                               ? FragmentNum::warm()
                               : FragmentNum::cold());
    }
  }

private:
  struct CallInfo {
    size_t Length;
    size_t Count;
  };

  struct SplitScore {
    size_t SplitIndex;
    size_t HotSizeReduction = 0;
    double LocalScore = 0;
    double CoverCallScore = 0;
  };

  // Auxiliary variables used by the algorithm.
  size_t TotalNumBlocks{0};
  size_t OrigHotSectionSize{0};
  DenseMap<const BinaryBasicBlock *, size_t> GlobalIndices;
  DenseMap<const BinaryBasicBlock *, size_t> BBSizes;
  DenseMap<const BinaryBasicBlock *, size_t> BBOffsets;

  // Call graph.
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callers;
  std::vector<SmallVector<const BinaryBasicBlock *, 0>> Callees;

  bool shouldConsiderForCallGraph(const BinaryFunction &BF) {
    // Only a subset of the functions in the binary will be considered
    // for initializing auxiliary variables and building the call graph.
    return BF.hasValidIndex() && BF.hasValidProfile() && !BF.empty();
  }

  void initializeAuxiliaryVariables() {
    for (BinaryFunction *BF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*BF))
        continue;

      // Calculate the size of each BB after hot-cold splitting.
      // This populates BinaryBasicBlock::OutputAddressRange which
      // can be used to compute the size of each BB.
      BC.calculateEmittedSize(*BF, /*FixBranches=*/true);

      for (BinaryBasicBlock *BB : BF->getLayout().blocks()) {
        // Unique global index.
        GlobalIndices[BB] = TotalNumBlocks;
        TotalNumBlocks++;

        // Block size after hot-cold splitting.
        BBSizes[BB] = BB->getOutputSize();

        // Hot block offset after hot-cold splitting.
        BBOffsets[BB] = OrigHotSectionSize;
        if (!BB->isSplit())
          OrigHotSectionSize += BBSizes[BB];
      }
    }
  }

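  /// Build a block-level call graph over the functions selected by
  /// shouldConsiderForCallGraph(): for every executed call from a block SrcBB
  /// to the entry block DstBB of a callee, record SrcBB in the callers of
  /// DstBB and DstBB in the callees of SrcBB, indexed by the global block
  /// indices assigned in initializeAuxiliaryVariables().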
  void buildCallGraph() {
    Callers.resize(TotalNumBlocks);
    Callees.resize(TotalNumBlocks);
    for (const BinaryFunction *SrcFunction : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*SrcFunction))
        continue;

      for (BinaryBasicBlock &SrcBB : SrcFunction->blocks()) {
        // Skip blocks that are not executed.
        if (SrcBB.getKnownExecutionCount() == 0)
          continue;

        // Find call instructions and extract target symbols from each one.
        for (const MCInst &Inst : SrcBB) {
          if (!BC.MIB->isCall(Inst))
            continue;

          // Call info.
          const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
          // Ignore calls w/o information.
          if (!DstSym)
            continue;

          const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
          // Ignore calls that do not have a valid target, but do not ignore
          // recursive calls, because the caller block could be moved to warm.
          if (!DstFunction || DstFunction->getLayout().block_empty())
            continue;

          const BinaryBasicBlock *DstBB = &(DstFunction->front());

          // Record the call only if DstBB is also in the functions considered
          // for the call graph.
          if (GlobalIndices.contains(DstBB)) {
            Callers[GlobalIndices[DstBB]].push_back(&SrcBB);
            Callees[GlobalIndices[&SrcBB]].push_back(DstBB);
          }
        }
      }
    }
  }

  /// Populate BinaryBasicBlock::OutputAddressRange with estimated basic block
  /// start and end addresses for hot and warm basic blocks, assuming hot-warm
  /// splitting happens at \p SplitIndex. Also return the estimated end
  /// addresses of the hot fragment before and after splitting.
  /// The estimations take into account the potential addition of branch
  /// instructions due to split fall through branches as well as the need to
  /// use longer branch instructions for split (un)conditional branches.
  std::pair<size_t, size_t>
  estimatePostSplitBBAddress(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex) {
    assert(SplitIndex < BlockOrder.size() && "Invalid split index");

    // Update the function layout assuming hot-warm splitting at SplitIndex.
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      BinaryBasicBlock *BB = BlockOrder[Index];
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      BB->setFragmentNum(Index <= SplitIndex ? FragmentNum::main()
                                             : FragmentNum::warm());
    }
    BinaryFunction *BF = BlockOrder[0]->getFunction();
    BF->getLayout().update(BlockOrder);
    // Populate BB.OutputAddressRange under the updated layout.
    BC.calculateEmittedSize(*BF);

    // Populate BB.OutputAddressRange with estimated new start and end
    // addresses, and compute the old and new end addresses of the hot section.
    size_t OldHotEndAddr;
    size_t NewHotEndAddr;
    size_t CurrentAddr = BBOffsets[BlockOrder[0]];
    for (BinaryBasicBlock *BB : BlockOrder) {
      // We only care about new addresses of blocks in hot/warm.
      if (BB->getFragmentNum() == FragmentNum::cold())
        break;
      const size_t NewSize = BB->getOutputSize();
      BB->setOutputStartAddress(CurrentAddr);
      CurrentAddr += NewSize;
      BB->setOutputEndAddress(CurrentAddr);
      if (BB->getLayoutIndex() == SplitIndex) {
        NewHotEndAddr = CurrentAddr;
        // Approximate the start address of the warm fragment of the current
        // function using the original hot section size.
        CurrentAddr = OrigHotSectionSize;
      }
      OldHotEndAddr = BBOffsets[BB] + BBSizes[BB];
    }
    return std::make_pair(OldHotEndAddr, NewHotEndAddr);
  }

  /// Get a collection of "shortenable" calls, that is, calls of type X->Y
  /// when the function order is [... X ... BF ... Y ...].
  /// If the hot fragment size of BF is reduced, then such calls are guaranteed
  /// to get shorter by the reduced hot fragment size.
  std::vector<CallInfo> extractCoverCalls(const BinaryFunction &BF) {
    // Record the length and the count of the calls that can be shortened.
    std::vector<CallInfo> CoverCalls;
    if (opts::CallScale == 0)
      return CoverCalls;

    const BinaryFunction *ThisBF = &BF;
    const BinaryBasicBlock *ThisBB = &(ThisBF->front());
    const size_t ThisGI = GlobalIndices[ThisBB];

    for (const BinaryFunction *DstBF : BC.getSortedFunctions()) {
      if (!shouldConsiderForCallGraph(*DstBF))
        continue;

      const BinaryBasicBlock *DstBB = &(DstBF->front());
      if (DstBB->getKnownExecutionCount() == 0)
        continue;

      const size_t DstGI = GlobalIndices[DstBB];
      for (const BinaryBasicBlock *SrcBB : Callers[DstGI]) {
        const BinaryFunction *SrcBF = SrcBB->getFunction();
        if (ThisBF == SrcBF)
          continue;

        const size_t CallCount = SrcBB->getKnownExecutionCount();

        const size_t SrcGI = GlobalIndices[SrcBB];

        const bool IsCoverCall = (SrcGI < ThisGI && ThisGI < DstGI) ||
                                 (DstGI <= ThisGI && ThisGI < SrcGI);
        if (!IsCoverCall)
          continue;

        const size_t SrcBBEndAddr = BBOffsets[SrcBB] + BBSizes[SrcBB];
        const size_t DstBBStartAddr = BBOffsets[DstBB];
        const size_t CallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        const CallInfo CI{CallLength, CallCount};
        CoverCalls.emplace_back(CI);
      }
    }
    return CoverCalls;
  }

  /// Compute the edge score of a call edge.
  double computeCallScore(uint64_t CallCount, size_t CallLength) {
    // Increase call lengths by 1 to avoid raising 0 to a negative power.
    return opts::CallScale * static_cast<double>(CallCount) /
           std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
  }

  /// Compute the edge score of a jump (branch) edge.
  double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
    // Increase jump lengths by 1 to avoid raising 0 to a negative power.
    return static_cast<double>(JumpCount) /
           std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
  }

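  // In other words, both edge scores above follow a distance-decayed
  // frequency model:
  //   CallScore(count, length) = call-scale * count / (length + 1)^call-power
  //   JumpScore(count, length) = count / (length + 1)^jump-power
  // so hotter and shorter edges contribute more to the overall split score.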

  /// Compute the sum of scores over jumps within \p BlockOrder given
  /// \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeJumpScore(const BasicBlockOrder &BlockOrder,
                        const size_t SplitIndex, SplitScore &Score) {

    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      if (SrcBB->getKnownExecutionCount() == 0)
        continue;

      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
        const BinaryBasicBlock *DstBB = std::get<0>(Pair);
        const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
        const size_t JumpCount = Branch.Count;

        if (JumpCount == 0)
          continue;

        const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
        const size_t NewJumpLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
      }
    }
  }

  /// Compute the sum of scores over calls originating in the current function
  /// given \p SplitIndex. Increment Score.LocalScore in place by the sum.
  void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex, SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    // Global index of the last block in the current function.
    // This is later used to determine whether a call originating in the
    // current function targets a function that comes after the current
    // function.
    const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];

    // The length of calls originating in the input function can increase or
    // decrease depending on the splitting decision.
    for (const BinaryBasicBlock *SrcBB : BlockOrder) {
      const size_t CallCount = SrcBB->getKnownExecutionCount();
      // If SrcBB does not call any functions, skip it.
      if (CallCount == 0)
        continue;

      // Obtain an estimate on the end address of the src basic block
      // after splitting at SplitIndex.
      const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;

      for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
        // Obtain an estimate on the start address of the dst basic block
        // after splitting at SplitIndex. If DstBB is in a function before
        // the current function, then its start address remains unchanged.
        size_t DstBBStartAddr = BBOffsets[DstBB];
        // If DstBB is in a function after the current function, then its
        // start address should be adjusted based on the reduction in hot size.
        if (GlobalIndices[DstBB] > LastGlobalIndex) {
          assert(DstBBStartAddr >= Score.HotSizeReduction);
          DstBBStartAddr -= Score.HotSizeReduction;
        }
        const size_t NewCallLength =
            AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
        Score.LocalScore += computeCallScore(CallCount, NewCallLength);
      }
    }
  }

  /// Compute the sum of splitting scores for cover calls of the input
  /// function. Increment Score.CoverCallScore in place by the sum.
  void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
                             const size_t SplitIndex,
                             const std::vector<CallInfo> &CoverCalls,
                             SplitScore &Score) {
    if (opts::CallScale == 0)
      return;

    for (const CallInfo CI : CoverCalls) {
      assert(CI.Length >= Score.HotSizeReduction &&
             "Length of cover calls must exceed reduced size of hot fragment.");
      // Compute the new length of the call, which is shorter than the
      // original one by the size of the split fragment minus the total size
      // increase.
      const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
      Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
    }
  }

  /// Compute the split score of splitting a function at a given index.
  /// The split score consists of a local score and a cover score. The cover
  /// call score is expensive to compute. As a result, we pass in a
  /// \p ReferenceScore and compute the cover score only when the local score
  /// exceeds that of the ReferenceScore, or when the size reduction of the
  /// hot fragment is larger than that achieved by the split index of the
  /// ReferenceScore. This function returns a \p Score of SplitScore type.
  /// It contains the local score and the cover score (if computed) of the
  /// current splitting index. For easier bookkeeping and comparison, it also
  /// stores the split index and the resulting reduction in hot fragment size.
  SplitScore computeSplitScore(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder,
                               const size_t SplitIndex,
                               const std::vector<CallInfo> &CoverCalls,
                               const SplitScore &ReferenceScore) {
    // Populate BinaryBasicBlock::OutputAddressRange with estimated
    // new start and end addresses after hot-warm splitting at SplitIndex.
    size_t OldHotEnd;
    size_t NewHotEnd;
    std::tie(OldHotEnd, NewHotEnd) =
        estimatePostSplitBBAddress(BlockOrder, SplitIndex);

    SplitScore Score;
    Score.SplitIndex = SplitIndex;

    // It's not worth splitting if OldHotEnd < NewHotEnd.
    if (OldHotEnd < NewHotEnd)
      return Score;

    // Hot fragment size reduction due to splitting.
    Score.HotSizeReduction = OldHotEnd - NewHotEnd;

    // The first part of LocalScore is the sum over call edges originating in
    // the input function. These edges can get shorter or longer depending on
    // SplitIndex. Score.LocalScore is incremented in place.
    computeLocalCallScore(BlockOrder, SplitIndex, Score);

    // The second part of LocalScore is the sum over jump edges with both the
    // src and dst basic blocks in the current function. Score.LocalScore is
    // incremented in place.
    computeJumpScore(BlockOrder, SplitIndex, Score);

    // There is no need to compute CoverCallScore if we have already found
    // another split index with a bigger LocalScore and bigger HotSizeReduction.
    if (Score.LocalScore <= ReferenceScore.LocalScore &&
        Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
      return Score;

    // Compute CoverCallScore and store it in Score in place.
    computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
    return Score;
  }

  /// Find the best index for splitting. The returned value is the index of the
  /// last hot basic block. Hence, "no splitting" is equivalent to returning a
  /// value that is one less than the size of the function.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder) {
    // Find all function calls that can be shortened if we move blocks of the
    // current function to warm/cold.
    const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

    // Try all possible split indices (blocks with Index <= SplitIndex are in
    // hot) and find the one maximizing the splitting score.
    SplitScore BestScore;
    double BestScoreSum = -1.0;
    SplitScore ReferenceScore;
    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
      const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
      // No need to keep cold blocks in the hot section.
      if (LastHotBB->getFragmentNum() == FragmentNum::cold())
        break;
      const SplitScore Score =
          computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
      double ScoreSum = Score.LocalScore + Score.CoverCallScore;
      if (ScoreSum > BestScoreSum) {
        BestScoreSum = ScoreSum;
        BestScore = Score;
      }
      if (Score.LocalScore > ReferenceScore.LocalScore)
        ReferenceScore = Score;
    }

    return BestScore.SplitIndex;
  }
};

struct SplitRandom2 final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandom2() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // We want to split at least one block.
    const auto LastSplitPoint = std::max<DiffT>(NumBlocks - 1, 1);
    std::uniform_int_distribution<DiffT> Dist(1, LastSplitPoint);
    const DiffT SplitPoint = Dist(Gen);
    for (BinaryBasicBlock *BB : llvm::make_range(Start + SplitPoint, End))
      BB->setFragmentNum(FragmentNum::cold());

    LLVM_DEBUG(dbgs() << formatv("BOLT-DEBUG: randomly chose last {0} (out of "
                                 "{1} possible) blocks to split\n",
                                 NumBlocks - SplitPoint, End - Start));
  }
};

struct SplitRandomN final : public SplitStrategy {
  std::minstd_rand0 Gen;

  SplitRandomN() : Gen(opts::RandomSeed.getValue()) {}

  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override { return true; }

  void fragment(const BlockIt Start, const BlockIt End) override {
    using DiffT = typename std::iterator_traits<BlockIt>::difference_type;
    const DiffT NumBlocks = End - Start;
    assert(NumBlocks > 0 && "Cannot fragment empty function");

    // With n blocks, there are n-1 places to split them.
    const DiffT MaximumSplits = NumBlocks - 1;
    // We want to generate at least two fragments if possible, but if there is
    // only one block, no splits are possible.
    const auto MinimumSplits = std::min<DiffT>(MaximumSplits, 1);
    std::uniform_int_distribution<DiffT> Dist(MinimumSplits, MaximumSplits);
    // Choose how many splits to perform.
    const DiffT NumSplits = Dist(Gen);

    // Draw split points from a lottery.
    SmallVector<unsigned, 0> Lottery(MaximumSplits);
    // Start the lottery at 1, because there is no meaningful split point
    // before the first block.
    std::iota(Lottery.begin(), Lottery.end(), 1u);
    std::shuffle(Lottery.begin(), Lottery.end(), Gen);
    Lottery.resize(NumSplits);
    llvm::sort(Lottery);

    // Add a one-past-the-end entry to the lottery.
    Lottery.push_back(NumBlocks);

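    // For example (illustrative): with 6 blocks and chosen split points
    // {2, 4}, blocks 0-1 stay in fragment 0 (main), blocks 2-3 go to
    // fragment 1, and blocks 4-5 go to fragment 2.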
    unsigned LotteryIndex = 0;
    unsigned BBPos = 0;
    for (BinaryBasicBlock *const BB : make_range(Start, End)) {
      // Check whether to start a new fragment.
      if (BBPos >= Lottery[LotteryIndex])
        ++LotteryIndex;

      // Because LotteryIndex is 0-based and cold fragments are 1-based, we
      // can use the index to assign fragments.
      BB->setFragmentNum(FragmentNum(LotteryIndex));

      ++BBPos;
    }
  }
};

struct SplitAll final : public SplitStrategy {
  bool canSplit(const BinaryFunction &BF) override { return true; }

  bool compactFragments() override {
    // Keeping empty fragments allows us to test that empty fragments do not
    // generate symbols.
    return false;
  }

  void fragment(const BlockIt Start, const BlockIt End) override {
    unsigned Fragment = 0;
    for (BinaryBasicBlock *const BB : llvm::make_range(Start, End))
      BB->setFragmentNum(FragmentNum(Fragment++));
  }
};
} // namespace

namespace llvm {
namespace bolt {

bool SplitFunctions::shouldOptimize(const BinaryFunction &BF) const {
  // Apply the execution count threshold.
  if (BF.getKnownExecutionCount() < opts::ExecutionCountThreshold)
    return false;

  return BinaryFunctionPass::shouldOptimize(BF);
}

void SplitFunctions::runOnFunctions(BinaryContext &BC) {
  if (!opts::SplitFunctions)
    return;

  // If the split strategy is not CDSplit, then a second run of the pass is not
  // needed after function reordering.
  if (BC.HasFinalizedFunctionOrder &&
      opts::SplitStrategy != SplitFunctionsStrategy::CDSplit)
    return;

  std::unique_ptr<SplitStrategy> Strategy;
  bool ForceSequential = false;

  switch (opts::SplitStrategy) {
  case SplitFunctionsStrategy::CDSplit:
    // CDSplit runs two splitting passes: hot-cold splitting (SplitProfile2)
    // before function reordering and hot-warm-cold splitting
    // (SplitCacheDirected) after function reordering.
    if (BC.HasFinalizedFunctionOrder)
      Strategy = std::make_unique<SplitCacheDirected>(BC);
    else
      Strategy = std::make_unique<SplitProfile2>();
    opts::AggressiveSplitting = true;
    BC.HasWarmSection = true;
    break;
  case SplitFunctionsStrategy::Profile2:
    Strategy = std::make_unique<SplitProfile2>();
    break;
  case SplitFunctionsStrategy::Random2:
    Strategy = std::make_unique<SplitRandom2>();
    // If we split functions randomly, we need to ensure that across runs with
    // the same input, we generate random numbers for each function in the
    // same order.
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::RandomN:
    Strategy = std::make_unique<SplitRandomN>();
    ForceSequential = true;
    break;
  case SplitFunctionsStrategy::All:
    Strategy = std::make_unique<SplitAll>();
    break;
  }

  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldOptimize(BF);
  };

  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { splitFunction(BF, *Strategy); }, SkipFunc,
      "SplitFunctions", ForceSequential);

  if (SplitBytesHot + SplitBytesCold > 0)
    outs() << "BOLT-INFO: splitting separates " << SplitBytesHot
           << " hot bytes from " << SplitBytesCold << " cold bytes "
           << format("(%.2lf%% of split functions is hot).\n",
                     100.0 * SplitBytesHot / (SplitBytesHot + SplitBytesCold));
}

void SplitFunctions::splitFunction(BinaryFunction &BF, SplitStrategy &S) {
  if (BF.empty())
    return;

  if (!S.canSplit(BF))
    return;

  FunctionLayout &Layout = BF.getLayout();
  BinaryFunction::BasicBlockOrderType PreSplitLayout(Layout.block_begin(),
                                                     Layout.block_end());

  BinaryContext &BC = BF.getBinaryContext();
  size_t OriginalHotSize;
  size_t HotSize;
  size_t ColdSize;
  if (BC.isX86()) {
    std::tie(OriginalHotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " pre-split is <0x"
                      << Twine::utohexstr(OriginalHotSize) << ", 0x"
                      << Twine::utohexstr(ColdSize) << ">\n");
  }

  BinaryFunction::BasicBlockOrderType NewLayout(Layout.block_begin(),
                                                Layout.block_end());
  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      continue;

    // Do not split extra entry points on AArch64. They can be referenced via
    // ADR instructions, and when that happens these blocks cannot be placed
    // far away due to the limited range of the ADR instruction.
    if (BC.isAArch64() && BB->isEntryPoint()) {
      BB->setCanOutline(false);
      continue;
    }

    if (BF.hasEHRanges() && !opts::SplitEH) {
      // We cannot move landing pads (or rather entry points for landing pads).
      if (BB->isLandingPad()) {
        BB->setCanOutline(false);
        continue;
      }
      // We cannot move a block that can throw since the exception-handling
      // runtime cannot deal with split functions. However, if we can guarantee
      // that the block never throws, it is safe to move the block to
      // decrease the size of the function.
      for (MCInst &Instr : *BB) {
        if (BC.MIB->isInvoke(Instr)) {
          BB->setCanOutline(false);
          break;
        }
      }
    }
  }

  BF.getLayout().updateLayoutIndices();
  S.fragment(NewLayout.begin(), NewLayout.end());

  // Make sure all non-outlineable blocks are in the main fragment.
  for (BinaryBasicBlock *const BB : NewLayout) {
    if (!BB->canOutline())
      BB->setFragmentNum(FragmentNum::main());
  }

  if (opts::AggressiveSplitting) {
    // All blocks with zero count that we can move go to the end of the
    // function, even if they were natural to cluster formation and were seen
    // in between hot basic blocks.
    llvm::stable_sort(NewLayout, [&](const BinaryBasicBlock *const A,
                                     const BinaryBasicBlock *const B) {
      return A->getFragmentNum() < B->getFragmentNum();
    });
  } else if (BF.hasEHRanges() && !opts::SplitEH) {
    // Typically, functions with exception handling have landing pads at the
    // end. We cannot move the beginning of a landing pad, but we can move
    // 0-count blocks comprising landing pads to the end and thus facilitate
    // splitting.
    auto FirstLP = NewLayout.begin();
    while ((*FirstLP)->isLandingPad())
      ++FirstLP;

    std::stable_sort(FirstLP, NewLayout.end(),
                     [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
                       return A->getFragmentNum() < B->getFragmentNum();
                     });
  }

  // Make sure that fragment numbers are non-decreasing in layout order.
  FragmentNum CurrentFragment = NewLayout.back()->getFragmentNum();
  for (BinaryBasicBlock *const BB : reverse(NewLayout)) {
    if (BB->getFragmentNum() > CurrentFragment)
      BB->setFragmentNum(CurrentFragment);
    CurrentFragment = BB->getFragmentNum();
  }

  if (S.compactFragments()) {
    FragmentNum CurrentFragment = FragmentNum::main();
    FragmentNum NewFragment = FragmentNum::main();
    for (BinaryBasicBlock *const BB : NewLayout) {
      if (BB->getFragmentNum() > CurrentFragment) {
        CurrentFragment = BB->getFragmentNum();
        NewFragment = FragmentNum(NewFragment.get() + 1);
      }
      BB->setFragmentNum(NewFragment);
    }
  }

  const bool LayoutUpdated = BF.getLayout().update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  TrampolineSetType Trampolines;
  if (!BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit())
    Trampolines = createEHTrampolines(BF);

  // Check the new size to see if it's worth splitting the function.
  if (BC.isX86() && LayoutUpdated) {
    std::tie(HotSize, ColdSize) = BC.calculateEmittedSize(BF);
    LLVM_DEBUG(dbgs() << "Estimated size for function " << BF
                      << " post-split is <0x" << Twine::utohexstr(HotSize)
                      << ", 0x" << Twine::utohexstr(ColdSize) << ">\n");
    if (alignTo(OriginalHotSize, opts::SplitAlignThreshold) <=
        alignTo(HotSize, opts::SplitAlignThreshold) + opts::SplitThreshold) {
      if (opts::Verbosity >= 2) {
        outs() << "BOLT-INFO: Reversing splitting of function "
               << formatv("{0}:\n {1:x}, {2:x} -> {3:x}\n", BF, HotSize,
                          ColdSize, OriginalHotSize);
      }

      // Reverse the action of createEHTrampolines(). The trampolines will be
      // placed immediately before the matching destination resulting in no
      // extra code.
      if (PreSplitLayout.size() != BF.size())
        PreSplitLayout = mergeEHTrampolines(BF, PreSplitLayout, Trampolines);

      for (BinaryBasicBlock &BB : BF)
        BB.setFragmentNum(FragmentNum::main());
      BF.getLayout().update(PreSplitLayout);
    } else {
      SplitBytesHot += HotSize;
      SplitBytesCold += ColdSize;
    }
  }

  // Fix branches if the splitting decision of the pass after function
  // reordering is different from that of the pass before function reordering.
  if (LayoutUpdated && BC.HasFinalizedFunctionOrder)
    BF.fixBranches();
}

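/// For each invoke instruction whose landing pad ended up in a different
/// fragment than the invoke itself, create (or reuse) a trampoline block in
/// the invoke's fragment that jumps to the real landing pad, and retarget the
/// instruction's EH info to the trampoline. Returns the mapping from
/// (fragment, landing-pad label) to trampoline label.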
SplitFunctions::TrampolineSetType
SplitFunctions::createEHTrampolines(BinaryFunction &BF) const {
  const auto &MIB = BF.getBinaryContext().MIB;

  // Map real landing pads to the corresponding trampolines.
  TrampolineSetType LPTrampolines;

  // Iterate over a copy of the basic blocks, since we are adding new blocks
  // to the function, which will invalidate its iterators.
  std::vector<BinaryBasicBlock *> Blocks(BF.pbegin(), BF.pend());
  for (BinaryBasicBlock *BB : Blocks) {
    for (MCInst &Instr : *BB) {
      const std::optional<MCPlus::MCLandingPad> EHInfo = MIB->getEHInfo(Instr);
      if (!EHInfo || !EHInfo->first)
        continue;

      const MCSymbol *LPLabel = EHInfo->first;
      BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(LPLabel);
      if (BB->getFragmentNum() == LPBlock->getFragmentNum())
        continue;

      const MCSymbol *TrampolineLabel = nullptr;
      const TrampolineKey Key(BB->getFragmentNum(), LPLabel);
      auto Iter = LPTrampolines.find(Key);
      if (Iter != LPTrampolines.end()) {
        TrampolineLabel = Iter->second;
      } else {
        // Create a trampoline basic block in the same fragment as the thrower.
        // Note: there's no need to insert the jump instruction, it will be
        // added by fixBranches().
        BinaryBasicBlock *TrampolineBB = BF.addBasicBlock();
        TrampolineBB->setFragmentNum(BB->getFragmentNum());
        TrampolineBB->setExecutionCount(LPBlock->getExecutionCount());
        TrampolineBB->addSuccessor(LPBlock, TrampolineBB->getExecutionCount());
        TrampolineBB->setCFIState(LPBlock->getCFIState());
        TrampolineLabel = TrampolineBB->getLabel();
        LPTrampolines.insert(std::make_pair(Key, TrampolineLabel));
      }

      // Substitute the landing pad with the trampoline.
      MIB->updateEHInfo(Instr,
                        MCPlus::MCLandingPad(TrampolineLabel, EHInfo->second));
    }
  }

  if (LPTrampolines.empty())
    return LPTrampolines;

  // All trampoline blocks were added to the end of the function. Place them at
  // the end of the corresponding fragments.
  BinaryFunction::BasicBlockOrderType NewLayout(BF.getLayout().block_begin(),
                                                BF.getLayout().block_end());
  stable_sort(NewLayout, [&](BinaryBasicBlock *A, BinaryBasicBlock *B) {
    return A->getFragmentNum() < B->getFragmentNum();
  });
  BF.getLayout().update(NewLayout);

  // Conservatively introduce branch instructions.
  BF.fixBranches();

  // Update the exception-handling CFG for the function.
  BF.recomputeLandingPads();

  return LPTrampolines;
}

SplitFunctions::BasicBlockOrderType SplitFunctions::mergeEHTrampolines(
    BinaryFunction &BF, SplitFunctions::BasicBlockOrderType &Layout,
    const SplitFunctions::TrampolineSetType &Trampolines) const {
  DenseMap<const MCSymbol *, SmallVector<const MCSymbol *, 0>>
      IncomingTrampolines;
  for (const auto &Entry : Trampolines) {
    IncomingTrampolines[Entry.getFirst().Target].emplace_back(
        Entry.getSecond());
  }

  BasicBlockOrderType MergedLayout;
  for (BinaryBasicBlock *BB : Layout) {
    auto Iter = IncomingTrampolines.find(BB->getLabel());
    if (Iter != IncomingTrampolines.end()) {
      for (const MCSymbol *const Trampoline : Iter->getSecond()) {
        BinaryBasicBlock *LPBlock = BF.getBasicBlockForLabel(Trampoline);
        assert(LPBlock && "Could not find matching landing pad block.");
        MergedLayout.push_back(LPBlock);
      }
    }
    MergedLayout.push_back(BB);
  }

  return MergedLayout;
}

} // namespace bolt
} // namespace llvm