1 //===--- Passes/ReorderAlgorithm.cpp - Basic block reorderng algorithms ---===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // Implements different basic block reordering algorithms. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "bolt/Passes/ReorderAlgorithm.h" 14 #include "bolt/Core/BinaryBasicBlock.h" 15 #include "bolt/Core/BinaryFunction.h" 16 #include "llvm/Support/CommandLine.h" 17 #include <queue> 18 #include <random> 19 #include <stack> 20 21 #undef DEBUG_TYPE 22 #define DEBUG_TYPE "bolt" 23 24 using namespace llvm; 25 using namespace bolt; 26 27 namespace opts { 28 29 extern cl::OptionCategory BoltOptCategory; 30 extern cl::opt<bool> NoThreads; 31 32 static cl::opt<unsigned> ColdThreshold( 33 "cold-threshold", 34 cl::desc("tenths of percents of main entry frequency to use as a " 35 "threshold when evaluating whether a basic block is cold " 36 "(0 means it is only considered cold if the block has zero " 37 "samples). Default: 0 "), 38 cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); 39 40 static cl::opt<bool> 41 PrintClusters("print-clusters", 42 cl::desc("print clusters"), 43 cl::ZeroOrMore, 44 cl::Hidden, 45 cl::cat(BoltOptCategory)); 46 47 cl::opt<uint32_t> 48 RandomSeed("bolt-seed", 49 cl::desc("seed for randomization"), 50 cl::init(42), 51 cl::ZeroOrMore, 52 cl::Hidden, 53 cl::cat(BoltOptCategory)); 54 55 } // namespace opts 56 57 namespace { 58 59 template <class T> 60 inline void hashCombine(size_t &Seed, const T &Val) { 61 std::hash<T> Hasher; 62 Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2); 63 } 64 65 template <typename A, typename B> 66 struct HashPair { 67 size_t operator()(const std::pair<A,B>& Val) const { 68 std::hash<A> Hasher; 69 size_t Seed = Hasher(Val.first); 70 hashCombine(Seed, Val.second); 71 return Seed; 72 } 73 }; 74 75 } 76 77 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { 78 // Create a separate MCCodeEmitter to allow lock-free execution 79 BinaryContext::IndependentCodeEmitter Emitter; 80 if (!opts::NoThreads) { 81 Emitter = BC.createIndependentMCCodeEmitter(); 82 } 83 84 AvgFreq.resize(Clusters.size(), 0.0); 85 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 86 double Freq = 0.0; 87 uint64_t ClusterSize = 0; 88 for (BinaryBasicBlock *BB : Clusters[I]) { 89 if (BB->getNumNonPseudos() > 0) { 90 Freq += BB->getExecutionCount(); 91 // Estimate the size of a block in bytes at run time 92 // NOTE: This might be inaccurate 93 ClusterSize += BB->estimateSize(Emitter.MCE.get()); 94 } 95 } 96 AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; 97 } 98 } 99 100 void ClusterAlgorithm::printClusters() const { 101 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 102 errs() << "Cluster number " << I; 103 if (AvgFreq.size() == Clusters.size()) 104 errs() << " (frequency: " << AvgFreq[I] << ")"; 105 errs() << " : "; 106 const char *Sep = ""; 107 for (BinaryBasicBlock *BB : Clusters[I]) { 108 errs() << Sep << BB->getName(); 109 Sep = ", "; 110 } 111 errs() << "\n"; 112 } 113 } 114 115 void ClusterAlgorithm::reset() { 116 Clusters.clear(); 117 ClusterEdges.clear(); 118 AvgFreq.clear(); 119 } 120 121 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { 122 OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; 123 } 124 125 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { 126 HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; 127 return Hasher(std::make_pair(E.Src, E.Dst)); 128 } 129 130 bool GreedyClusterAlgorithm::EdgeEqual::operator()( 131 const EdgeTy &A, const EdgeTy &B) const { 132 return A.Src == B.Src && A.Dst == B.Dst; 133 } 134 135 void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, 136 bool ComputeEdges) { 137 reset(); 138 139 // Greedy heuristic implementation for the TSP, applied to BB layout. Try to 140 // maximize weight during a path traversing all BBs. In this way, we will 141 // convert the hottest branches into fall-throughs. 142 143 // This is the queue of edges from which we will pop edges and use them to 144 // cluster basic blocks in a greedy fashion. 145 std::vector<EdgeTy> Queue; 146 147 // Initialize inter-cluster weights. 148 if (ComputeEdges) 149 ClusterEdges.resize(BF.layout_size()); 150 151 // Initialize clusters and edge queue. 152 for (BinaryBasicBlock *BB : BF.layout()) { 153 // Create a cluster for this BB. 154 uint32_t I = Clusters.size(); 155 Clusters.emplace_back(); 156 std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); 157 Cluster.push_back(BB); 158 BBToClusterMap[BB] = I; 159 // Populate priority queue with edges. 160 auto BI = BB->branch_info_begin(); 161 for (BinaryBasicBlock *&I : BB->successors()) { 162 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 163 "attempted reordering blocks of function with no profile data"); 164 Queue.emplace_back(EdgeTy(BB, I, BI->Count)); 165 ++BI; 166 } 167 } 168 // Sort and adjust the edge queue. 169 initQueue(Queue, BF); 170 171 // Grow clusters in a greedy fashion. 172 while (!Queue.empty()) { 173 EdgeTy E = Queue.back(); 174 Queue.pop_back(); 175 176 const BinaryBasicBlock *SrcBB = E.Src; 177 const BinaryBasicBlock *DstBB = E.Dst; 178 179 LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n"); 180 181 // Case 1: BBSrc and BBDst are the same. Ignore this edge 182 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 183 LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); 184 continue; 185 } 186 187 int I = BBToClusterMap[SrcBB]; 188 int J = BBToClusterMap[DstBB]; 189 190 // Case 2: If they are already allocated at the same cluster, just increase 191 // the weight of this cluster 192 if (I == J) { 193 if (ComputeEdges) 194 ClusterEdges[I][I] += E.Count; 195 LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); 196 continue; 197 } 198 199 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 200 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 201 if (areClustersCompatible(ClusterA, ClusterB, E)) { 202 // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, 203 // allowing us to merge two clusters. 204 for (BinaryBasicBlock *BB : ClusterB) 205 BBToClusterMap[BB] = I; 206 ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); 207 ClusterB.clear(); 208 if (ComputeEdges) { 209 // Increase the intra-cluster edge count of cluster A with the count of 210 // this edge as well as with the total count of previously visited edges 211 // from cluster B cluster A. 212 ClusterEdges[I][I] += E.Count; 213 ClusterEdges[I][I] += ClusterEdges[J][I]; 214 // Iterate through all inter-cluster edges and transfer edges targeting 215 // cluster B to cluster A. 216 for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) 217 ClusterEdges[K][I] += ClusterEdges[K][J]; 218 } 219 // Adjust the weights of the remaining edges and re-sort the queue. 220 adjustQueue(Queue, BF); 221 LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); 222 } else { 223 // Case 4: Both SrcBB and DstBB are allocated in positions we cannot 224 // merge them. Add the count of this edge to the inter-cluster edge count 225 // between clusters A and B to help us decide ordering between these 226 // clusters. 227 if (ComputeEdges) 228 ClusterEdges[I][J] += E.Count; 229 LLVM_DEBUG( 230 dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); 231 } 232 } 233 } 234 235 void GreedyClusterAlgorithm::reset() { 236 ClusterAlgorithm::reset(); 237 BBToClusterMap.clear(); 238 } 239 240 void PHGreedyClusterAlgorithm::initQueue( 241 std::vector<EdgeTy> &Queue, const BinaryFunction &BF) { 242 // Define a comparison function to establish SWO between edges. 243 auto Comp = [&BF] (const EdgeTy &A, const EdgeTy &B) { 244 // With equal weights, prioritize branches with lower index 245 // source/destination. This helps to keep original block order for blocks 246 // when optimal order cannot be deducted from a profile. 247 if (A.Count == B.Count) { 248 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 249 return (SrcOrder != 0) 250 ? SrcOrder > 0 251 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 252 } 253 return A.Count < B.Count; 254 }; 255 256 // Sort edges in increasing profile count order. 257 std::sort(Queue.begin(), Queue.end(), Comp); 258 } 259 260 void PHGreedyClusterAlgorithm::adjustQueue( 261 std::vector<EdgeTy> &Queue, const BinaryFunction &BF) { 262 // Nothing to do. 263 return; 264 } 265 266 bool PHGreedyClusterAlgorithm::areClustersCompatible( 267 const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { 268 return Front.back() == E.Src && Back.front() == E.Dst; 269 } 270 271 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( 272 const EdgeTy &E, const BinaryFunction &BF) const { 273 const BinaryBasicBlock *SrcBB = E.Src; 274 const BinaryBasicBlock *DstBB = E.Dst; 275 276 // Initial weight value. 277 int64_t W = (int64_t)E.Count; 278 279 // Adjust the weight by taking into account other edges with the same source. 280 auto BI = SrcBB->branch_info_begin(); 281 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 282 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 283 "attempted reordering blocks of function with no profile data"); 284 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 285 "overflow detected"); 286 // Ignore edges with same source and destination, edges that target the 287 // entry block as well as the edge E itself. 288 if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) 289 W -= (int64_t)BI->Count; 290 ++BI; 291 } 292 293 // Adjust the weight by taking into account other edges with the same 294 // destination. 295 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 296 // Ignore edges with same source and destination as well as the edge E 297 // itself. 298 if (PredBB == DstBB || PredBB == SrcBB) 299 continue; 300 auto BI = PredBB->branch_info_begin(); 301 for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { 302 if (SuccBB == DstBB) 303 break; 304 ++BI; 305 } 306 assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); 307 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 308 "attempted reordering blocks of function with no profile data"); 309 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 310 "overflow detected"); 311 W -= (int64_t)BI->Count; 312 } 313 314 return W; 315 } 316 317 void MinBranchGreedyClusterAlgorithm::initQueue( 318 std::vector<EdgeTy> &Queue, const BinaryFunction &BF) { 319 // Initialize edge weights. 320 for (const EdgeTy &E : Queue) 321 Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); 322 323 // Sort edges in increasing weight order. 324 adjustQueue(Queue, BF); 325 } 326 327 void MinBranchGreedyClusterAlgorithm::adjustQueue( 328 std::vector<EdgeTy> &Queue, const BinaryFunction &BF) { 329 // Define a comparison function to establish SWO between edges. 330 auto Comp = [&] (const EdgeTy &A, const EdgeTy &B) { 331 // With equal weights, prioritize branches with lower index 332 // source/destination. This helps to keep original block order for blocks 333 // when optimal order cannot be deduced from a profile. 334 if (Weight[A] == Weight[B]) { 335 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 336 return (SrcOrder != 0) 337 ? SrcOrder > 0 338 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 339 } 340 return Weight[A] < Weight[B]; 341 }; 342 343 // Iterate through all remaining edges to find edges that have their 344 // source and destination in the same cluster. 345 std::vector<EdgeTy> NewQueue; 346 for (const EdgeTy &E : Queue) { 347 const BinaryBasicBlock *SrcBB = E.Src; 348 const BinaryBasicBlock *DstBB = E.Dst; 349 350 // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore 351 // this edge. 352 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 353 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 354 dbgs() << " (same src, dst)\n"); 355 continue; 356 } 357 358 int I = BBToClusterMap[SrcBB]; 359 int J = BBToClusterMap[DstBB]; 360 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 361 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 362 363 // Case 2: They are already allocated at the same cluster or incompatible 364 // clusters. Adjust the weights of edges with the same source or 365 // destination, so that this edge has no effect on them any more, and ignore 366 // this edge. Also increase the intra- (or inter-) cluster edge count. 367 if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { 368 if (!ClusterEdges.empty()) 369 ClusterEdges[I][J] += E.Count; 370 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 371 dbgs() << " (src, dst belong to same cluster or incompatible " 372 "clusters)\n"); 373 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 374 if (SuccBB == DstBB) 375 continue; 376 auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); 377 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 378 WI->second += (int64_t)E.Count; 379 } 380 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 381 if (PredBB == SrcBB) 382 continue; 383 auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); 384 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 385 WI->second += (int64_t)E.Count; 386 } 387 continue; 388 } 389 390 // Case 3: None of the previous cases is true, so just keep this edge in 391 // the queue. 392 NewQueue.emplace_back(E); 393 } 394 395 // Sort remaining edges in increasing weight order. 396 Queue.swap(NewQueue); 397 std::sort(Queue.begin(), Queue.end(), Comp); 398 } 399 400 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( 401 const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { 402 return Front.back() == E.Src && Back.front() == E.Dst; 403 } 404 405 void MinBranchGreedyClusterAlgorithm::reset() { 406 GreedyClusterAlgorithm::reset(); 407 Weight.clear(); 408 } 409 410 void TSPReorderAlgorithm::reorderBasicBlocks( 411 const BinaryFunction &BF, BasicBlockOrder &Order) const { 412 std::vector<std::vector<uint64_t>> Weight; 413 std::vector<BinaryBasicBlock *> IndexToBB; 414 415 const size_t N = BF.layout_size(); 416 assert(N <= std::numeric_limits<uint64_t>::digits && 417 "cannot use TSP solution for sizes larger than bits in uint64_t"); 418 419 // Populating weight map and index map 420 for (BinaryBasicBlock *BB : BF.layout()) { 421 BB->setLayoutIndex(IndexToBB.size()); 422 IndexToBB.push_back(BB); 423 } 424 Weight.resize(N); 425 for (BinaryBasicBlock *BB : BF.layout()) { 426 auto BI = BB->branch_info_begin(); 427 Weight[BB->getLayoutIndex()].resize(N); 428 for (BinaryBasicBlock *SuccBB : BB->successors()) { 429 if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) 430 Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; 431 ++BI; 432 } 433 } 434 435 std::vector<std::vector<int64_t>> DP; 436 DP.resize(1 << N); 437 for (std::vector<int64_t> &Elmt : DP) { 438 Elmt.resize(N, -1); 439 } 440 // Start with the entry basic block being allocated with cost zero 441 DP[1][0] = 0; 442 // Walk through TSP solutions using a bitmask to represent state (current set 443 // of BBs in the layout) 444 uint64_t BestSet = 1; 445 uint64_t BestLast = 0; 446 int64_t BestWeight = 0; 447 for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { 448 // Traverse each possibility of Last BB visited in this layout 449 for (uint64_t Last = 0; Last < N; ++Last) { 450 // Case 1: There is no possible layout with this BB as Last 451 if (DP[Set][Last] == -1) 452 continue; 453 454 // Case 2: There is a layout with this Set and this Last, and we try 455 // to expand this set with New 456 for (uint64_t New = 1; New < N; ++New) { 457 // Case 2a: BB "New" is already in this Set 458 if ((Set & (1ULL << New)) != 0) 459 continue; 460 461 // Case 2b: BB "New" is not in this set and we add it to this Set and 462 // record total weight of this layout with "New" as the last BB. 463 uint64_t NewSet = (Set | (1ULL << New)); 464 if (DP[NewSet][New] == -1) 465 DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; 466 DP[NewSet][New] = std::max(DP[NewSet][New], 467 DP[Set][Last] + (int64_t)Weight[Last][New]); 468 469 if (DP[NewSet][New] > BestWeight) { 470 BestWeight = DP[NewSet][New]; 471 BestSet = NewSet; 472 BestLast = New; 473 } 474 } 475 } 476 } 477 478 // Define final function layout based on layout that maximizes weight 479 uint64_t Last = BestLast; 480 uint64_t Set = BestSet; 481 std::vector<bool> Visited; 482 Visited.resize(N); 483 Visited[Last] = true; 484 Order.push_back(IndexToBB[Last]); 485 Set = Set & ~(1ULL << Last); 486 while (Set != 0) { 487 int64_t Best = -1; 488 uint64_t NewLast; 489 for (uint64_t I = 0; I < N; ++I) { 490 if (DP[Set][I] == -1) 491 continue; 492 int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; 493 if (DP[Set][I] + AdjWeight > Best) { 494 NewLast = I; 495 Best = DP[Set][I] + AdjWeight; 496 } 497 } 498 Last = NewLast; 499 Visited[Last] = true; 500 Order.push_back(IndexToBB[Last]); 501 Set = Set & ~(1ULL << Last); 502 } 503 std::reverse(Order.begin(), Order.end()); 504 505 // Finalize layout with BBs that weren't assigned to the layout using the 506 // input layout. 507 for (BinaryBasicBlock *BB : BF.layout()) { 508 if (Visited[BB->getLayoutIndex()] == false) 509 Order.push_back(BB); 510 } 511 } 512 513 void OptimizeReorderAlgorithm::reorderBasicBlocks( 514 const BinaryFunction &BF, BasicBlockOrder &Order) const { 515 if (BF.layout_empty()) 516 return; 517 518 // Cluster basic blocks. 519 CAlgo->clusterBasicBlocks(BF); 520 521 if (opts::PrintClusters) 522 CAlgo->printClusters(); 523 524 // Arrange basic blocks according to clusters. 525 for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) 526 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 527 } 528 529 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( 530 const BinaryFunction &BF, BasicBlockOrder &Order) const { 531 if (BF.layout_empty()) 532 return; 533 534 // Cluster basic blocks. 535 CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */true); 536 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 537 std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = 538 CAlgo->ClusterEdges; 539 540 // Compute clusters' average frequencies. 541 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 542 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 543 544 if (opts::PrintClusters) 545 CAlgo->printClusters(); 546 547 // Cluster layout order 548 std::vector<uint32_t> ClusterOrder; 549 550 // Do a topological sort for clusters, prioritizing frequently-executed BBs 551 // during the traversal. 552 std::stack<uint32_t> Stack; 553 std::vector<uint32_t> Status; 554 std::vector<uint32_t> Parent; 555 Status.resize(Clusters.size(), 0); 556 Parent.resize(Clusters.size(), 0); 557 constexpr uint32_t STACKED = 1; 558 constexpr uint32_t VISITED = 2; 559 Status[0] = STACKED; 560 Stack.push(0); 561 while (!Stack.empty()) { 562 uint32_t I = Stack.top(); 563 if (!(Status[I] & VISITED)) { 564 Status[I] |= VISITED; 565 // Order successors by weight 566 auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { 567 return ClusterEdges[I][A] > ClusterEdges[I][B]; 568 }; 569 std::priority_queue<uint32_t, std::vector<uint32_t>, 570 decltype(ClusterComp)> SuccQueue(ClusterComp); 571 for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { 572 if (Target.second > 0 && !(Status[Target.first] & STACKED) && 573 !Clusters[Target.first].empty()) { 574 Parent[Target.first] = I; 575 Status[Target.first] = STACKED; 576 SuccQueue.push(Target.first); 577 } 578 } 579 while (!SuccQueue.empty()) { 580 Stack.push(SuccQueue.top()); 581 SuccQueue.pop(); 582 } 583 continue; 584 } 585 // Already visited this node 586 Stack.pop(); 587 ClusterOrder.push_back(I); 588 } 589 std::reverse(ClusterOrder.begin(), ClusterOrder.end()); 590 // Put unreachable clusters at the end 591 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 592 if (!(Status[I] & VISITED) && !Clusters[I].empty()) 593 ClusterOrder.push_back(I); 594 595 // Sort nodes with equal precedence 596 auto Beg = ClusterOrder.begin(); 597 // Don't reorder the first cluster, which contains the function entry point 598 ++Beg; 599 std::stable_sort(Beg, ClusterOrder.end(), 600 [&AvgFreq, &Parent](uint32_t A, uint32_t B) { 601 uint32_t P = Parent[A]; 602 while (Parent[P] != 0) { 603 if (Parent[P] == B) 604 return false; 605 P = Parent[P]; 606 } 607 P = Parent[B]; 608 while (Parent[P] != 0) { 609 if (Parent[P] == A) 610 return true; 611 P = Parent[P]; 612 } 613 return AvgFreq[A] > AvgFreq[B]; 614 }); 615 616 if (opts::PrintClusters) { 617 errs() << "New cluster order: "; 618 const char *Sep = ""; 619 for (uint32_t O : ClusterOrder) { 620 errs() << Sep << O; 621 Sep = ", "; 622 } 623 errs() << '\n'; 624 } 625 626 // Arrange basic blocks according to cluster order. 627 for (uint32_t ClusterIndex : ClusterOrder) { 628 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 629 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 630 } 631 } 632 633 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( 634 const BinaryFunction &BF, BasicBlockOrder &Order) const { 635 if (BF.layout_empty()) 636 return; 637 638 const uint64_t ColdThreshold = 639 opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000; 640 641 // Cluster basic blocks. 642 CAlgo->clusterBasicBlocks(BF); 643 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 644 645 // Compute clusters' average frequencies. 646 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 647 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 648 649 if (opts::PrintClusters) 650 CAlgo->printClusters(); 651 652 // Cluster layout order 653 std::vector<uint32_t> ClusterOrder; 654 655 // Order clusters based on average instruction execution frequency 656 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 657 if (!Clusters[I].empty()) 658 ClusterOrder.push_back(I); 659 // Don't reorder the first cluster, which contains the function entry point 660 std::stable_sort(std::next(ClusterOrder.begin()), 661 ClusterOrder.end(), 662 [&AvgFreq](uint32_t A, uint32_t B) { 663 return AvgFreq[A] > AvgFreq[B]; 664 }); 665 666 if (opts::PrintClusters) { 667 errs() << "New cluster order: "; 668 const char *Sep = ""; 669 for (uint32_t O : ClusterOrder) { 670 errs() << Sep << O; 671 Sep = ", "; 672 } 673 errs() << '\n'; 674 } 675 676 // Arrange basic blocks according to cluster order. 677 for (uint32_t ClusterIndex : ClusterOrder) { 678 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 679 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 680 // Force zero execution count on clusters that do not meet the cut off 681 // specified by --cold-threshold. 682 if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) { 683 for (BinaryBasicBlock *BBPtr : Cluster) { 684 BBPtr->setExecutionCount(0); 685 } 686 } 687 } 688 } 689 690 void ReverseReorderAlgorithm::reorderBasicBlocks( 691 const BinaryFunction &BF, BasicBlockOrder &Order) const { 692 if (BF.layout_empty()) 693 return; 694 695 BinaryBasicBlock *FirstBB = *BF.layout_begin(); 696 Order.push_back(FirstBB); 697 for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) 698 Order.push_back(*RLI); 699 } 700 701 void RandomClusterReorderAlgorithm::reorderBasicBlocks( 702 const BinaryFunction &BF, BasicBlockOrder &Order) const { 703 if (BF.layout_empty()) 704 return; 705 706 // Cluster basic blocks. 707 CAlgo->clusterBasicBlocks(BF); 708 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 709 710 if (opts::PrintClusters) 711 CAlgo->printClusters(); 712 713 // Cluster layout order 714 std::vector<uint32_t> ClusterOrder; 715 716 // Order clusters based on average instruction execution frequency 717 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 718 if (!Clusters[I].empty()) 719 ClusterOrder.push_back(I); 720 721 std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), 722 std::default_random_engine(opts::RandomSeed.getValue())); 723 724 if (opts::PrintClusters) { 725 errs() << "New cluster order: "; 726 const char *Sep = ""; 727 for (uint32_t O : ClusterOrder) { 728 errs() << Sep << O; 729 Sep = ", "; 730 } 731 errs() << '\n'; 732 } 733 734 // Arrange basic blocks according to cluster order. 735 for (uint32_t ClusterIndex : ClusterOrder) { 736 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 737 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 738 } 739 } 740