//===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements classes used by several basic block reordering
// algorithms.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/CodeLayout.h"
#include <queue>
#include <random>
#include <stack>

#undef DEBUG_TYPE
#define DEBUG_TYPE "bolt"

using namespace llvm;
using namespace bolt;

namespace opts {

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;

static cl::opt<unsigned> ColdThreshold(
    "cold-threshold",
    cl::desc("tenths of percents of main entry frequency to use as a "
             "threshold when evaluating whether a basic block is cold "
             "(0 means it is only considered cold if the block has zero "
             "samples). Default: 0 "),
    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
                                   cl::Hidden, cl::cat(BoltOptCategory));

cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
                             cl::init(42), cl::Hidden,
                             cl::cat(BoltOptCategory));

} // namespace opts

namespace {

// Boost-style hash combiner: mixes the hash of \p Val into \p Seed so that
// multiple values can be folded into a single hash.
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  std::hash<T> Hasher;
  Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

// Hash functor for std::pair, combining the hashes of both members.
template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    std::hash<A> Hasher;
    size_t Seed = Hasher(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

} // namespace

// Compute, for every cluster, its execution frequency normalized by its
// estimated code size in bytes (stored in AvgFreq, parallel to Clusters).
// Blocks consisting only of pseudo instructions contribute nothing.
void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
  // Create a separate MCCodeEmitter to allow lock-free execution
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads)
    Emitter = BC.createIndependentMCCodeEmitter();

  AvgFreq.resize(Clusters.size(), 0.0);
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    double Freq = 0.0;
    uint64_t ClusterSize = 0;
    for (const BinaryBasicBlock *BB : Clusters[I]) {
      if (BB->getNumNonPseudos() > 0) {
        Freq += BB->getExecutionCount();
        // Estimate the size of a block in bytes at run time
        // NOTE: This might be inaccurate
        ClusterSize += BB->estimateSize(Emitter.MCE.get());
      }
    }
    // Guard against division by zero for clusters with no sized blocks.
    AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
  }
}

// Dump each cluster (and its average frequency, if computed) to stderr.
void ClusterAlgorithm::printClusters() const {
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
    errs() << "Cluster number " << I;
    if (AvgFreq.size() == Clusters.size())
      errs() << " (frequency: " << AvgFreq[I] << ")";
    errs() << " : ";
    const char *Sep = "";
    for (const BinaryBasicBlock *BB : Clusters[I]) {
      errs() << Sep << BB->getName();
      Sep = ", ";
    }
    errs() << "\n";
  }
}

// Clear all per-function state so the algorithm can be reused.
void ClusterAlgorithm::reset() {
  Clusters.clear();
  ClusterEdges.clear();
  AvgFreq.clear();
}

void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
  OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
}

// Hash an edge by its (source, destination) pair; Count is deliberately
// ignored so lookups with a dummy count (see adjustQueue) find the edge.
size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
  HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
  return Hasher(std::make_pair(E.Src, E.Dst));
}

// Edge equality likewise compares only endpoints, not counts.
bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
                                                   const EdgeTy &B) const {
  return A.Src == B.Src && A.Dst == B.Dst;
}

// Build clusters of basic blocks by greedily merging clusters connected by
// the hottest edges. When \p ComputeEdges is set, also maintain the
// inter-/intra-cluster edge count matrix (ClusterEdges).
void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF,
                                                bool ComputeEdges) {
  reset();

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight during a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.

  // This is the queue of edges from which we will pop edges and use them to
  // cluster basic blocks in a greedy fashion.
  std::vector<EdgeTy> Queue;

  // Initialize inter-cluster weights.
  if (ComputeEdges)
    ClusterEdges.resize(BF.getLayout().block_size());

  // Initialize clusters and edge queue.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Create a cluster for this BB.
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate priority queue with edges.
    auto BI = BB->branch_info_begin();
    for (const BinaryBasicBlock *I : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, I, BI->Count));
      ++BI;
    }
  }
  // Sort and adjust the edge queue (subclass-specific priority order; edges
  // are consumed from the back, so the highest-priority edge sorts last).
  initQueue(Queue, BF);

  // Grow clusters in a greedy fashion.
  while (!Queue.empty()) {
    EdgeTy E = Queue.back();
    Queue.pop_back();

    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

    // Case 1: SrcBB and DstBB are the same, or DstBB is the entry block,
    // which must remain first in the layout. Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];

    // Case 2: If they are already allocated at the same cluster, just increase
    // the weight of this cluster
    if (I == J) {
      if (ComputeEdges)
        ClusterEdges[I][I] += E.Count;
      LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
      continue;
    }

    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
    if (areClustersCompatible(ClusterA, ClusterB, E)) {
      // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
      // allowing us to merge two clusters.
      for (const BinaryBasicBlock *BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited edges
        // from cluster B to cluster A.
        ClusterEdges[I][I] += E.Count;
        ClusterEdges[I][I] += ClusterEdges[J][I];
        // Iterate through all inter-cluster edges and transfer edges targeting
        // cluster B to cluster A.
        for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
          ClusterEdges[K][I] += ClusterEdges[K][J];
      }
      // Adjust the weights of the remaining edges and re-sort the queue.
      adjustQueue(Queue, BF);
      LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
    } else {
      // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
      // merge them. Add the count of this edge to the inter-cluster edge count
      // between clusters A and B to help us decide ordering between these
      // clusters.
      if (ComputeEdges)
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(
          dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
    }
  }
}

void GreedyClusterAlgorithm::reset() {
  ClusterAlgorithm::reset();
  BBToClusterMap.clear();
}

// Pettis-Hansen-style priority: order edges purely by execution count, with
// the original layout order breaking ties deterministically.
void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                         const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges.
  auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (A.Count == B.Count) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return A.Count < B.Count;
  };

  // Sort edges in increasing profile count order.
  llvm::sort(Queue, Comp);
}

void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                           const BinaryFunction &BF) {
  // Nothing to do.
}

// Clusters are mergeable only when the edge runs from the tail of the front
// cluster to the head of the back cluster, i.e. it becomes a fall-through.
bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
                                                     const ClusterTy &Back,
                                                     const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

// Weight of edge E = its own count minus the counts of all competing edges
// (other out-edges of Src and other in-edges of Dst). A higher weight means
// making E a fall-through loses less profile count to the branches that
// cannot also become fall-throughs.
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value.
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with same source and destination, edges that target the
    // entry block as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() &&
        SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    // Walk the predecessor's branch info in lock-step with its successor
    // list to find the count of the PredBB -> DstBB edge.
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}

void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
                                                const BinaryFunction &BF) {
  // Initialize edge weights.
  for (const EdgeTy &E : Queue)
    Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));

  // Sort edges in increasing weight order.
  adjustQueue(Queue, BF);
}

// Drop edges that can no longer lead to a merge, compensating the weights of
// their competing edges, and re-sort the remaining queue by weight.
void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                  const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges.
  auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
    // this edge.
    if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

    // Case 2: They are already allocated at the same cluster or incompatible
    // clusters. Adjust the weights of edges with the same source or
    // destination, so that this edge has no effect on them any more, and ignore
    // this edge. Also increase the intra- (or inter-) cluster edge count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (src, dst belong to same cluster or incompatible "
                           "clusters)\n");
      // Give back E's count to every sibling edge whose weight it had
      // reduced; lookups use a dummy count since EdgeHash/EdgeEqual compare
      // only the (Src, Dst) endpoints.
      for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  llvm::sort(Queue, Comp);
}

bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
    const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
  return Front.back() == E.Src && Back.front() == E.Dst;
}

void MinBranchGreedyClusterAlgorithm::reset() {
  GreedyClusterAlgorithm::reset();
  Weight.clear();
}

// Exact exponential-time solution of the layout problem via a Held-Karp-style
// dynamic program over bitmask states: DP[Set][Last] is the best total
// fall-through weight of a layout containing exactly the blocks in Set and
// ending at Last. Only feasible for small functions (N limited to the bit
// width of uint64_t by the assert; callers are expected to gate on size
// since DP uses O(2^N * N) memory).
void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                             BasicBlockOrder &Order) const {
  std::vector<std::vector<uint64_t>> Weight;
  std::vector<BinaryBasicBlock *> IndexToBB;

  const size_t N = BF.getLayout().block_size();
  assert(N <= std::numeric_limits<uint64_t>::digits &&
         "cannot use TSP solution for sizes larger than bits in uint64_t");

  // Populating weight map and index map
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    BB->setLayoutIndex(IndexToBB.size());
    IndexToBB.push_back(BB);
  }
  Weight.resize(N);
  for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    auto BI = BB->branch_info_begin();
    Weight[BB->getLayoutIndex()].resize(N);
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
        Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
      ++BI;
    }
  }

  std::vector<std::vector<int64_t>> DP;
  DP.resize(1 << N);
  for (std::vector<int64_t> &Elmt : DP)
    Elmt.resize(N, -1); // -1 marks an unreachable (Set, Last) state

  // Start with the entry basic block being allocated with cost zero
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (current set
  // of BBs in the layout)
  uint64_t BestSet = 1;
  uint64_t BestLast = 0;
  int64_t BestWeight = 0;
  for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
    // Traverse each possibility of Last BB visited in this layout
    for (uint64_t Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New
      for (uint64_t New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set
        if ((Set & (1ULL << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this set and we add it to this Set and
        // record total weight of this layout with "New" as the last BB.
        uint64_t NewSet = (Set | (1ULL << New));
        if (DP[NewSet][New] == -1)
          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
        // The max also covers the first-visit case, since DP values and
        // weights are non-negative while unvisited states hold -1.
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  // Define final function layout based on layout that maximizes weight;
  // reconstruct the path backwards from the best (Set, Last) state.
  uint64_t Last = BestLast;
  uint64_t Set = BestSet;
  BitVector Visited;
  Visited.resize(N);
  Visited[Last] = true;
  Order.push_back(IndexToBB[Last]);
  Set = Set & ~(1ULL << Last);
  while (Set != 0) {
    int64_t Best = -1;
    uint64_t NewLast;
    for (uint64_t I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
      if (DP[Set][I] + AdjWeight > Best) {
        NewLast = I;
        Best = DP[Set][I] + AdjWeight;
      }
    }
    Last = NewLast;
    Visited[Last] = true;
    Order.push_back(IndexToBB[Last]);
    Set = Set & ~(1ULL << Last);
  }
  // The path was built back-to-front; restore front-to-back order.
  std::reverse(Order.begin(), Order.end());

  // Finalize layout with BBs that weren't assigned to the layout using the
  // input layout.
  for (BinaryBasicBlock *BB : BF.getLayout().blocks())
    if (Visited[BB->getLayoutIndex()] == false)
      Order.push_back(BB);
}

// Delegate layout to the shared ext-TSP implementation in
// llvm/Transforms/Utils/CodeLayout, feeding it estimated block sizes,
// block execution counts, and jump counts.
void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Do not change layout of functions w/o profile information
  if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) {
    for (BinaryBasicBlock *BB : BF.getLayout().blocks())
      Order.push_back(BB);
    return;
  }

  // Create a separate MCCodeEmitter to allow lock-free execution
  BinaryContext::IndependentCodeEmitter Emitter;
  if (!opts::NoThreads)
    Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();

  // Initialize CFG nodes and their data
  std::vector<uint64_t> BlockSizes;
  std::vector<uint64_t> BlockCounts;
  BasicBlockOrder OrigOrder;
  BF.getLayout().updateLayoutIndices();
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    // Clamp to at least 1 byte so empty blocks do not degenerate the model.
    uint64_t Size = std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1);
    BlockSizes.push_back(Size);
    BlockCounts.push_back(BB->getKnownExecutionCount());
    OrigOrder.push_back(BB);
  }

  // Initialize CFG edges
  using JumpT = std::pair<uint64_t, uint64_t>;
  std::vector<std::pair<JumpT, uint64_t>> JumpCounts;
  for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
    auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "missing profile for a jump");
      auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex());
      JumpCounts.push_back(std::make_pair(It, BI->Count));
      ++BI;
    }
  }

  // Run the layout algorithm
  auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
  Order.reserve(BF.getLayout().block_size());
  for (uint64_t R : Result)
    Order.push_back(OrigOrder[R]);
}

// Simplest strategy: lay blocks out in cluster discovery order.
void OptimizeReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Arrange basic blocks according to clusters.
  for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
}

// Order clusters with a topological sort of the cluster graph (edges weighted
// by inter-cluster branch counts), then stable-sort clusters of equal
// precedence by average frequency.
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks, keeping inter-cluster edge counts.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
      CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal. Iterative DFS: a node is pushed (STACKED), and when
  // seen again on top of the stack after its children (VISITED) it is emitted
  // in post-order; reversing the post-order yields a topological order.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)>
          SuccQueue(ClusterComp);
      for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      // Push lightest successors first so the heaviest is visited first.
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point
  ++Beg;
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     // Respect ancestry in the DFS tree: never place a
                     // cluster before one of its ancestors.
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     // Otherwise prefer the hotter cluster.
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}

// Order clusters purely by decreasing average execution frequency (entry
// cluster stays first), and zero out execution counts of clusters colder
// than the --cold-threshold cutoff.
void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cutoff in the same units as AvgFreq: opts::ColdThreshold is tenths of a
  // percent of the entry block's execution count.
  const uint64_t ColdThreshold =
      opts::ColdThreshold *
      (*BF.getLayout().block_begin())->getExecutionCount() / 1000;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);
  // Don't reorder the first cluster, which contains the function entry point
  std::stable_sort(
      std::next(ClusterOrder.begin()), ClusterOrder.end(),
      [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
    // Force zero execution count on clusters that do not meet the cut off
    // specified by --cold-threshold.
    if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
      for (BinaryBasicBlock *BBPtr : Cluster)
        BBPtr->setExecutionCount(0);
  }
}

// Testing/debugging layout: keep the entry block first, then emit the
// remaining blocks in reverse of the current layout order.
void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
                                                 BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin();
  Order.push_back(FirstBB);
  // Walk backwards until the entry block (already emitted) is reached.
  for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI)
    Order.push_back(*RLI);
}

// Testing/debugging layout: shuffle clusters pseudo-randomly (seeded by
// --bolt-seed for reproducibility), keeping the entry cluster first.
void RandomClusterReorderAlgorithm::reorderBasicBlocks(
    BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.getLayout().block_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);

  std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
               std::default_random_engine(opts::RandomSeed.getValue()));

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}