1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements classes used by several basic block reordering 10 // algorithms. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "bolt/Passes/ReorderAlgorithm.h" 15 #include "bolt/Core/BinaryBasicBlock.h" 16 #include "bolt/Core/BinaryFunction.h" 17 #include "llvm/Support/CommandLine.h" 18 #include <queue> 19 #include <random> 20 #include <stack> 21 22 #undef DEBUG_TYPE 23 #define DEBUG_TYPE "bolt" 24 25 using namespace llvm; 26 using namespace bolt; 27 28 namespace opts { 29 30 extern cl::OptionCategory BoltOptCategory; 31 extern cl::opt<bool> NoThreads; 32 33 static cl::opt<unsigned> ColdThreshold( 34 "cold-threshold", 35 cl::desc("tenths of percents of main entry frequency to use as a " 36 "threshold when evaluating whether a basic block is cold " 37 "(0 means it is only considered cold if the block has zero " 38 "samples). Default: 0 "), 39 cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); 40 41 static cl::opt<bool> 42 PrintClusters("print-clusters", 43 cl::desc("print clusters"), 44 cl::ZeroOrMore, 45 cl::Hidden, 46 cl::cat(BoltOptCategory)); 47 48 cl::opt<uint32_t> 49 RandomSeed("bolt-seed", 50 cl::desc("seed for randomization"), 51 cl::init(42), 52 cl::ZeroOrMore, 53 cl::Hidden, 54 cl::cat(BoltOptCategory)); 55 56 } // namespace opts 57 58 namespace { 59 60 template <class T> inline void hashCombine(size_t &Seed, const T &Val) { 61 std::hash<T> Hasher; 62 Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2); 63 } 64 65 template <typename A, typename B> struct HashPair { 66 size_t operator()(const std::pair<A, B> &Val) const { 67 std::hash<A> Hasher; 68 size_t Seed = Hasher(Val.first); 69 hashCombine(Seed, Val.second); 70 return Seed; 71 } 72 }; 73 74 } // namespace 75 76 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { 77 // Create a separate MCCodeEmitter to allow lock-free execution 78 BinaryContext::IndependentCodeEmitter Emitter; 79 if (!opts::NoThreads) 80 Emitter = BC.createIndependentMCCodeEmitter(); 81 82 AvgFreq.resize(Clusters.size(), 0.0); 83 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 84 double Freq = 0.0; 85 uint64_t ClusterSize = 0; 86 for (BinaryBasicBlock *BB : Clusters[I]) { 87 if (BB->getNumNonPseudos() > 0) { 88 Freq += BB->getExecutionCount(); 89 // Estimate the size of a block in bytes at run time 90 // NOTE: This might be inaccurate 91 ClusterSize += BB->estimateSize(Emitter.MCE.get()); 92 } 93 } 94 AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; 95 } 96 } 97 98 void ClusterAlgorithm::printClusters() const { 99 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 100 errs() << "Cluster number " << I; 101 if (AvgFreq.size() == Clusters.size()) 102 errs() << " (frequency: " << AvgFreq[I] << ")"; 103 errs() << " : "; 104 const char *Sep = ""; 105 for (BinaryBasicBlock *BB : Clusters[I]) { 106 errs() << Sep << BB->getName(); 107 Sep = ", "; 108 } 109 errs() << "\n"; 110 } 111 } 112 113 void ClusterAlgorithm::reset() { 114 Clusters.clear(); 115 ClusterEdges.clear(); 116 AvgFreq.clear(); 117 } 118 119 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { 120 OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; 121 } 122 123 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { 124 HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; 125 return Hasher(std::make_pair(E.Src, E.Dst)); 126 } 127 128 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A, 129 const EdgeTy &B) const { 130 return A.Src == B.Src && A.Dst == B.Dst; 131 } 132 133 void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, 134 bool ComputeEdges) { 135 reset(); 136 137 // Greedy heuristic implementation for the TSP, applied to BB layout. Try to 138 // maximize weight during a path traversing all BBs. In this way, we will 139 // convert the hottest branches into fall-throughs. 140 141 // This is the queue of edges from which we will pop edges and use them to 142 // cluster basic blocks in a greedy fashion. 143 std::vector<EdgeTy> Queue; 144 145 // Initialize inter-cluster weights. 146 if (ComputeEdges) 147 ClusterEdges.resize(BF.layout_size()); 148 149 // Initialize clusters and edge queue. 150 for (BinaryBasicBlock *BB : BF.layout()) { 151 // Create a cluster for this BB. 152 uint32_t I = Clusters.size(); 153 Clusters.emplace_back(); 154 std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); 155 Cluster.push_back(BB); 156 BBToClusterMap[BB] = I; 157 // Populate priority queue with edges. 158 auto BI = BB->branch_info_begin(); 159 for (BinaryBasicBlock *&I : BB->successors()) { 160 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 161 "attempted reordering blocks of function with no profile data"); 162 Queue.emplace_back(EdgeTy(BB, I, BI->Count)); 163 ++BI; 164 } 165 } 166 // Sort and adjust the edge queue. 167 initQueue(Queue, BF); 168 169 // Grow clusters in a greedy fashion. 170 while (!Queue.empty()) { 171 EdgeTy E = Queue.back(); 172 Queue.pop_back(); 173 174 const BinaryBasicBlock *SrcBB = E.Src; 175 const BinaryBasicBlock *DstBB = E.Dst; 176 177 LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n"); 178 179 // Case 1: BBSrc and BBDst are the same. Ignore this edge 180 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 181 LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); 182 continue; 183 } 184 185 int I = BBToClusterMap[SrcBB]; 186 int J = BBToClusterMap[DstBB]; 187 188 // Case 2: If they are already allocated at the same cluster, just increase 189 // the weight of this cluster 190 if (I == J) { 191 if (ComputeEdges) 192 ClusterEdges[I][I] += E.Count; 193 LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); 194 continue; 195 } 196 197 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 198 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 199 if (areClustersCompatible(ClusterA, ClusterB, E)) { 200 // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, 201 // allowing us to merge two clusters. 202 for (BinaryBasicBlock *BB : ClusterB) 203 BBToClusterMap[BB] = I; 204 ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); 205 ClusterB.clear(); 206 if (ComputeEdges) { 207 // Increase the intra-cluster edge count of cluster A with the count of 208 // this edge as well as with the total count of previously visited edges 209 // from cluster B cluster A. 210 ClusterEdges[I][I] += E.Count; 211 ClusterEdges[I][I] += ClusterEdges[J][I]; 212 // Iterate through all inter-cluster edges and transfer edges targeting 213 // cluster B to cluster A. 214 for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) 215 ClusterEdges[K][I] += ClusterEdges[K][J]; 216 } 217 // Adjust the weights of the remaining edges and re-sort the queue. 218 adjustQueue(Queue, BF); 219 LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); 220 } else { 221 // Case 4: Both SrcBB and DstBB are allocated in positions we cannot 222 // merge them. Add the count of this edge to the inter-cluster edge count 223 // between clusters A and B to help us decide ordering between these 224 // clusters. 225 if (ComputeEdges) 226 ClusterEdges[I][J] += E.Count; 227 LLVM_DEBUG( 228 dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); 229 } 230 } 231 } 232 233 void GreedyClusterAlgorithm::reset() { 234 ClusterAlgorithm::reset(); 235 BBToClusterMap.clear(); 236 } 237 238 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 239 const BinaryFunction &BF) { 240 // Define a comparison function to establish SWO between edges. 241 auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) { 242 // With equal weights, prioritize branches with lower index 243 // source/destination. This helps to keep original block order for blocks 244 // when optimal order cannot be deducted from a profile. 245 if (A.Count == B.Count) { 246 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 247 return (SrcOrder != 0) 248 ? SrcOrder > 0 249 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 250 } 251 return A.Count < B.Count; 252 }; 253 254 // Sort edges in increasing profile count order. 255 std::sort(Queue.begin(), Queue.end(), Comp); 256 } 257 258 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 259 const BinaryFunction &BF) { 260 // Nothing to do. 261 return; 262 } 263 264 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front, 265 const ClusterTy &Back, 266 const EdgeTy &E) const { 267 return Front.back() == E.Src && Back.front() == E.Dst; 268 } 269 270 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( 271 const EdgeTy &E, const BinaryFunction &BF) const { 272 const BinaryBasicBlock *SrcBB = E.Src; 273 const BinaryBasicBlock *DstBB = E.Dst; 274 275 // Initial weight value. 276 int64_t W = (int64_t)E.Count; 277 278 // Adjust the weight by taking into account other edges with the same source. 279 auto BI = SrcBB->branch_info_begin(); 280 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 281 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 282 "attempted reordering blocks of function with no profile data"); 283 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 284 "overflow detected"); 285 // Ignore edges with same source and destination, edges that target the 286 // entry block as well as the edge E itself. 287 if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) 288 W -= (int64_t)BI->Count; 289 ++BI; 290 } 291 292 // Adjust the weight by taking into account other edges with the same 293 // destination. 294 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 295 // Ignore edges with same source and destination as well as the edge E 296 // itself. 297 if (PredBB == DstBB || PredBB == SrcBB) 298 continue; 299 auto BI = PredBB->branch_info_begin(); 300 for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { 301 if (SuccBB == DstBB) 302 break; 303 ++BI; 304 } 305 assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); 306 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 307 "attempted reordering blocks of function with no profile data"); 308 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 309 "overflow detected"); 310 W -= (int64_t)BI->Count; 311 } 312 313 return W; 314 } 315 316 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 317 const BinaryFunction &BF) { 318 // Initialize edge weights. 319 for (const EdgeTy &E : Queue) 320 Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); 321 322 // Sort edges in increasing weight order. 323 adjustQueue(Queue, BF); 324 } 325 326 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 327 const BinaryFunction &BF) { 328 // Define a comparison function to establish SWO between edges. 329 auto Comp = [&](const EdgeTy &A, const EdgeTy &B) { 330 // With equal weights, prioritize branches with lower index 331 // source/destination. This helps to keep original block order for blocks 332 // when optimal order cannot be deduced from a profile. 333 if (Weight[A] == Weight[B]) { 334 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 335 return (SrcOrder != 0) 336 ? SrcOrder > 0 337 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 338 } 339 return Weight[A] < Weight[B]; 340 }; 341 342 // Iterate through all remaining edges to find edges that have their 343 // source and destination in the same cluster. 344 std::vector<EdgeTy> NewQueue; 345 for (const EdgeTy &E : Queue) { 346 const BinaryBasicBlock *SrcBB = E.Src; 347 const BinaryBasicBlock *DstBB = E.Dst; 348 349 // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore 350 // this edge. 351 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 352 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 353 dbgs() << " (same src, dst)\n"); 354 continue; 355 } 356 357 int I = BBToClusterMap[SrcBB]; 358 int J = BBToClusterMap[DstBB]; 359 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 360 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 361 362 // Case 2: They are already allocated at the same cluster or incompatible 363 // clusters. Adjust the weights of edges with the same source or 364 // destination, so that this edge has no effect on them any more, and ignore 365 // this edge. Also increase the intra- (or inter-) cluster edge count. 366 if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { 367 if (!ClusterEdges.empty()) 368 ClusterEdges[I][J] += E.Count; 369 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 370 dbgs() << " (src, dst belong to same cluster or incompatible " 371 "clusters)\n"); 372 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 373 if (SuccBB == DstBB) 374 continue; 375 auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); 376 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 377 WI->second += (int64_t)E.Count; 378 } 379 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 380 if (PredBB == SrcBB) 381 continue; 382 auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); 383 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 384 WI->second += (int64_t)E.Count; 385 } 386 continue; 387 } 388 389 // Case 3: None of the previous cases is true, so just keep this edge in 390 // the queue. 391 NewQueue.emplace_back(E); 392 } 393 394 // Sort remaining edges in increasing weight order. 395 Queue.swap(NewQueue); 396 std::sort(Queue.begin(), Queue.end(), Comp); 397 } 398 399 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( 400 const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { 401 return Front.back() == E.Src && Back.front() == E.Dst; 402 } 403 404 void MinBranchGreedyClusterAlgorithm::reset() { 405 GreedyClusterAlgorithm::reset(); 406 Weight.clear(); 407 } 408 409 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 410 BasicBlockOrder &Order) const { 411 std::vector<std::vector<uint64_t>> Weight; 412 std::vector<BinaryBasicBlock *> IndexToBB; 413 414 const size_t N = BF.layout_size(); 415 assert(N <= std::numeric_limits<uint64_t>::digits && 416 "cannot use TSP solution for sizes larger than bits in uint64_t"); 417 418 // Populating weight map and index map 419 for (BinaryBasicBlock *BB : BF.layout()) { 420 BB->setLayoutIndex(IndexToBB.size()); 421 IndexToBB.push_back(BB); 422 } 423 Weight.resize(N); 424 for (BinaryBasicBlock *BB : BF.layout()) { 425 auto BI = BB->branch_info_begin(); 426 Weight[BB->getLayoutIndex()].resize(N); 427 for (BinaryBasicBlock *SuccBB : BB->successors()) { 428 if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) 429 Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; 430 ++BI; 431 } 432 } 433 434 std::vector<std::vector<int64_t>> DP; 435 DP.resize(1 << N); 436 for (std::vector<int64_t> &Elmt : DP) 437 Elmt.resize(N, -1); 438 439 // Start with the entry basic block being allocated with cost zero 440 DP[1][0] = 0; 441 // Walk through TSP solutions using a bitmask to represent state (current set 442 // of BBs in the layout) 443 uint64_t BestSet = 1; 444 uint64_t BestLast = 0; 445 int64_t BestWeight = 0; 446 for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { 447 // Traverse each possibility of Last BB visited in this layout 448 for (uint64_t Last = 0; Last < N; ++Last) { 449 // Case 1: There is no possible layout with this BB as Last 450 if (DP[Set][Last] == -1) 451 continue; 452 453 // Case 2: There is a layout with this Set and this Last, and we try 454 // to expand this set with New 455 for (uint64_t New = 1; New < N; ++New) { 456 // Case 2a: BB "New" is already in this Set 457 if ((Set & (1ULL << New)) != 0) 458 continue; 459 460 // Case 2b: BB "New" is not in this set and we add it to this Set and 461 // record total weight of this layout with "New" as the last BB. 462 uint64_t NewSet = (Set | (1ULL << New)); 463 if (DP[NewSet][New] == -1) 464 DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; 465 DP[NewSet][New] = std::max(DP[NewSet][New], 466 DP[Set][Last] + (int64_t)Weight[Last][New]); 467 468 if (DP[NewSet][New] > BestWeight) { 469 BestWeight = DP[NewSet][New]; 470 BestSet = NewSet; 471 BestLast = New; 472 } 473 } 474 } 475 } 476 477 // Define final function layout based on layout that maximizes weight 478 uint64_t Last = BestLast; 479 uint64_t Set = BestSet; 480 std::vector<bool> Visited; 481 Visited.resize(N); 482 Visited[Last] = true; 483 Order.push_back(IndexToBB[Last]); 484 Set = Set & ~(1ULL << Last); 485 while (Set != 0) { 486 int64_t Best = -1; 487 uint64_t NewLast; 488 for (uint64_t I = 0; I < N; ++I) { 489 if (DP[Set][I] == -1) 490 continue; 491 int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; 492 if (DP[Set][I] + AdjWeight > Best) { 493 NewLast = I; 494 Best = DP[Set][I] + AdjWeight; 495 } 496 } 497 Last = NewLast; 498 Visited[Last] = true; 499 Order.push_back(IndexToBB[Last]); 500 Set = Set & ~(1ULL << Last); 501 } 502 std::reverse(Order.begin(), Order.end()); 503 504 // Finalize layout with BBs that weren't assigned to the layout using the 505 // input layout. 506 for (BinaryBasicBlock *BB : BF.layout()) 507 if (Visited[BB->getLayoutIndex()] == false) 508 Order.push_back(BB); 509 } 510 511 void OptimizeReorderAlgorithm::reorderBasicBlocks( 512 const BinaryFunction &BF, BasicBlockOrder &Order) const { 513 if (BF.layout_empty()) 514 return; 515 516 // Cluster basic blocks. 517 CAlgo->clusterBasicBlocks(BF); 518 519 if (opts::PrintClusters) 520 CAlgo->printClusters(); 521 522 // Arrange basic blocks according to clusters. 523 for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) 524 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 525 } 526 527 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( 528 const BinaryFunction &BF, BasicBlockOrder &Order) const { 529 if (BF.layout_empty()) 530 return; 531 532 // Cluster basic blocks. 533 CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true); 534 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 535 std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = 536 CAlgo->ClusterEdges; 537 538 // Compute clusters' average frequencies. 539 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 540 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 541 542 if (opts::PrintClusters) 543 CAlgo->printClusters(); 544 545 // Cluster layout order 546 std::vector<uint32_t> ClusterOrder; 547 548 // Do a topological sort for clusters, prioritizing frequently-executed BBs 549 // during the traversal. 550 std::stack<uint32_t> Stack; 551 std::vector<uint32_t> Status; 552 std::vector<uint32_t> Parent; 553 Status.resize(Clusters.size(), 0); 554 Parent.resize(Clusters.size(), 0); 555 constexpr uint32_t STACKED = 1; 556 constexpr uint32_t VISITED = 2; 557 Status[0] = STACKED; 558 Stack.push(0); 559 while (!Stack.empty()) { 560 uint32_t I = Stack.top(); 561 if (!(Status[I] & VISITED)) { 562 Status[I] |= VISITED; 563 // Order successors by weight 564 auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { 565 return ClusterEdges[I][A] > ClusterEdges[I][B]; 566 }; 567 std::priority_queue<uint32_t, std::vector<uint32_t>, 568 decltype(ClusterComp)> 569 SuccQueue(ClusterComp); 570 for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { 571 if (Target.second > 0 && !(Status[Target.first] & STACKED) && 572 !Clusters[Target.first].empty()) { 573 Parent[Target.first] = I; 574 Status[Target.first] = STACKED; 575 SuccQueue.push(Target.first); 576 } 577 } 578 while (!SuccQueue.empty()) { 579 Stack.push(SuccQueue.top()); 580 SuccQueue.pop(); 581 } 582 continue; 583 } 584 // Already visited this node 585 Stack.pop(); 586 ClusterOrder.push_back(I); 587 } 588 std::reverse(ClusterOrder.begin(), ClusterOrder.end()); 589 // Put unreachable clusters at the end 590 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 591 if (!(Status[I] & VISITED) && !Clusters[I].empty()) 592 ClusterOrder.push_back(I); 593 594 // Sort nodes with equal precedence 595 auto Beg = ClusterOrder.begin(); 596 // Don't reorder the first cluster, which contains the function entry point 597 ++Beg; 598 std::stable_sort(Beg, ClusterOrder.end(), 599 [&AvgFreq, &Parent](uint32_t A, uint32_t B) { 600 uint32_t P = Parent[A]; 601 while (Parent[P] != 0) { 602 if (Parent[P] == B) 603 return false; 604 P = Parent[P]; 605 } 606 P = Parent[B]; 607 while (Parent[P] != 0) { 608 if (Parent[P] == A) 609 return true; 610 P = Parent[P]; 611 } 612 return AvgFreq[A] > AvgFreq[B]; 613 }); 614 615 if (opts::PrintClusters) { 616 errs() << "New cluster order: "; 617 const char *Sep = ""; 618 for (uint32_t O : ClusterOrder) { 619 errs() << Sep << O; 620 Sep = ", "; 621 } 622 errs() << '\n'; 623 } 624 625 // Arrange basic blocks according to cluster order. 626 for (uint32_t ClusterIndex : ClusterOrder) { 627 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 628 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 629 } 630 } 631 632 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( 633 const BinaryFunction &BF, BasicBlockOrder &Order) const { 634 if (BF.layout_empty()) 635 return; 636 637 const uint64_t ColdThreshold = 638 opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000; 639 640 // Cluster basic blocks. 641 CAlgo->clusterBasicBlocks(BF); 642 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 643 644 // Compute clusters' average frequencies. 645 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 646 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 647 648 if (opts::PrintClusters) 649 CAlgo->printClusters(); 650 651 // Cluster layout order 652 std::vector<uint32_t> ClusterOrder; 653 654 // Order clusters based on average instruction execution frequency 655 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 656 if (!Clusters[I].empty()) 657 ClusterOrder.push_back(I); 658 // Don't reorder the first cluster, which contains the function entry point 659 std::stable_sort( 660 std::next(ClusterOrder.begin()), ClusterOrder.end(), 661 [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; }); 662 663 if (opts::PrintClusters) { 664 errs() << "New cluster order: "; 665 const char *Sep = ""; 666 for (uint32_t O : ClusterOrder) { 667 errs() << Sep << O; 668 Sep = ", "; 669 } 670 errs() << '\n'; 671 } 672 673 // Arrange basic blocks according to cluster order. 674 for (uint32_t ClusterIndex : ClusterOrder) { 675 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 676 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 677 // Force zero execution count on clusters that do not meet the cut off 678 // specified by --cold-threshold. 679 if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) 680 for (BinaryBasicBlock *BBPtr : Cluster) 681 BBPtr->setExecutionCount(0); 682 } 683 } 684 685 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 686 BasicBlockOrder &Order) const { 687 if (BF.layout_empty()) 688 return; 689 690 BinaryBasicBlock *FirstBB = *BF.layout_begin(); 691 Order.push_back(FirstBB); 692 for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) 693 Order.push_back(*RLI); 694 } 695 696 void RandomClusterReorderAlgorithm::reorderBasicBlocks( 697 const BinaryFunction &BF, BasicBlockOrder &Order) const { 698 if (BF.layout_empty()) 699 return; 700 701 // Cluster basic blocks. 702 CAlgo->clusterBasicBlocks(BF); 703 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 704 705 if (opts::PrintClusters) 706 CAlgo->printClusters(); 707 708 // Cluster layout order 709 std::vector<uint32_t> ClusterOrder; 710 711 // Order clusters based on average instruction execution frequency 712 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 713 if (!Clusters[I].empty()) 714 ClusterOrder.push_back(I); 715 716 std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), 717 std::default_random_engine(opts::RandomSeed.getValue())); 718 719 if (opts::PrintClusters) { 720 errs() << "New cluster order: "; 721 const char *Sep = ""; 722 for (uint32_t O : ClusterOrder) { 723 errs() << Sep << O; 724 Sep = ", "; 725 } 726 errs() << '\n'; 727 } 728 729 // Arrange basic blocks according to cluster order. 730 for (uint32_t ClusterIndex : ClusterOrder) { 731 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 732 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 733 } 734 } 735