1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements classes used by several basic block reordering 10 // algorithms. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "bolt/Passes/ReorderAlgorithm.h" 15 #include "bolt/Core/BinaryBasicBlock.h" 16 #include "bolt/Core/BinaryFunction.h" 17 #include "llvm/Support/CommandLine.h" 18 #include <queue> 19 #include <random> 20 #include <stack> 21 22 #undef DEBUG_TYPE 23 #define DEBUG_TYPE "bolt" 24 25 using namespace llvm; 26 using namespace bolt; 27 28 namespace opts { 29 30 extern cl::OptionCategory BoltOptCategory; 31 extern cl::opt<bool> NoThreads; 32 33 static cl::opt<unsigned> ColdThreshold( 34 "cold-threshold", 35 cl::desc("tenths of percents of main entry frequency to use as a " 36 "threshold when evaluating whether a basic block is cold " 37 "(0 means it is only considered cold if the block has zero " 38 "samples). Default: 0 "), 39 cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); 40 41 static cl::opt<bool> 42 PrintClusters("print-clusters", 43 cl::desc("print clusters"), 44 cl::ZeroOrMore, 45 cl::Hidden, 46 cl::cat(BoltOptCategory)); 47 48 cl::opt<uint32_t> 49 RandomSeed("bolt-seed", 50 cl::desc("seed for randomization"), 51 cl::init(42), 52 cl::ZeroOrMore, 53 cl::Hidden, 54 cl::cat(BoltOptCategory)); 55 56 } // namespace opts 57 58 namespace { 59 60 template <class T> inline void hashCombine(size_t &Seed, const T &Val) { 61 std::hash<T> Hasher; 62 Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2); 63 } 64 65 template <typename A, typename B> struct HashPair { 66 size_t operator()(const std::pair<A, B> &Val) const { 67 std::hash<A> Hasher; 68 size_t Seed = Hasher(Val.first); 69 hashCombine(Seed, Val.second); 70 return Seed; 71 } 72 }; 73 74 } // namespace 75 76 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { 77 // Create a separate MCCodeEmitter to allow lock-free execution 78 BinaryContext::IndependentCodeEmitter Emitter; 79 if (!opts::NoThreads) { 80 Emitter = BC.createIndependentMCCodeEmitter(); 81 } 82 83 AvgFreq.resize(Clusters.size(), 0.0); 84 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 85 double Freq = 0.0; 86 uint64_t ClusterSize = 0; 87 for (BinaryBasicBlock *BB : Clusters[I]) { 88 if (BB->getNumNonPseudos() > 0) { 89 Freq += BB->getExecutionCount(); 90 // Estimate the size of a block in bytes at run time 91 // NOTE: This might be inaccurate 92 ClusterSize += BB->estimateSize(Emitter.MCE.get()); 93 } 94 } 95 AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; 96 } 97 } 98 99 void ClusterAlgorithm::printClusters() const { 100 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 101 errs() << "Cluster number " << I; 102 if (AvgFreq.size() == Clusters.size()) 103 errs() << " (frequency: " << AvgFreq[I] << ")"; 104 errs() << " : "; 105 const char *Sep = ""; 106 for (BinaryBasicBlock *BB : Clusters[I]) { 107 errs() << Sep << BB->getName(); 108 Sep = ", "; 109 } 110 errs() << "\n"; 111 } 112 } 113 114 void ClusterAlgorithm::reset() { 115 Clusters.clear(); 116 ClusterEdges.clear(); 117 AvgFreq.clear(); 118 } 119 120 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { 121 OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; 122 } 123 124 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { 125 HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; 126 return Hasher(std::make_pair(E.Src, E.Dst)); 127 } 128 129 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A, 130 const EdgeTy &B) const { 131 return A.Src == B.Src && A.Dst == B.Dst; 132 } 133 134 void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, 135 bool ComputeEdges) { 136 reset(); 137 138 // Greedy heuristic implementation for the TSP, applied to BB layout. Try to 139 // maximize weight during a path traversing all BBs. In this way, we will 140 // convert the hottest branches into fall-throughs. 141 142 // This is the queue of edges from which we will pop edges and use them to 143 // cluster basic blocks in a greedy fashion. 144 std::vector<EdgeTy> Queue; 145 146 // Initialize inter-cluster weights. 147 if (ComputeEdges) 148 ClusterEdges.resize(BF.layout_size()); 149 150 // Initialize clusters and edge queue. 151 for (BinaryBasicBlock *BB : BF.layout()) { 152 // Create a cluster for this BB. 153 uint32_t I = Clusters.size(); 154 Clusters.emplace_back(); 155 std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); 156 Cluster.push_back(BB); 157 BBToClusterMap[BB] = I; 158 // Populate priority queue with edges. 159 auto BI = BB->branch_info_begin(); 160 for (BinaryBasicBlock *&I : BB->successors()) { 161 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 162 "attempted reordering blocks of function with no profile data"); 163 Queue.emplace_back(EdgeTy(BB, I, BI->Count)); 164 ++BI; 165 } 166 } 167 // Sort and adjust the edge queue. 168 initQueue(Queue, BF); 169 170 // Grow clusters in a greedy fashion. 171 while (!Queue.empty()) { 172 EdgeTy E = Queue.back(); 173 Queue.pop_back(); 174 175 const BinaryBasicBlock *SrcBB = E.Src; 176 const BinaryBasicBlock *DstBB = E.Dst; 177 178 LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n"); 179 180 // Case 1: BBSrc and BBDst are the same. Ignore this edge 181 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 182 LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); 183 continue; 184 } 185 186 int I = BBToClusterMap[SrcBB]; 187 int J = BBToClusterMap[DstBB]; 188 189 // Case 2: If they are already allocated at the same cluster, just increase 190 // the weight of this cluster 191 if (I == J) { 192 if (ComputeEdges) 193 ClusterEdges[I][I] += E.Count; 194 LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); 195 continue; 196 } 197 198 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 199 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 200 if (areClustersCompatible(ClusterA, ClusterB, E)) { 201 // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, 202 // allowing us to merge two clusters. 203 for (BinaryBasicBlock *BB : ClusterB) 204 BBToClusterMap[BB] = I; 205 ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); 206 ClusterB.clear(); 207 if (ComputeEdges) { 208 // Increase the intra-cluster edge count of cluster A with the count of 209 // this edge as well as with the total count of previously visited edges 210 // from cluster B cluster A. 211 ClusterEdges[I][I] += E.Count; 212 ClusterEdges[I][I] += ClusterEdges[J][I]; 213 // Iterate through all inter-cluster edges and transfer edges targeting 214 // cluster B to cluster A. 215 for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) 216 ClusterEdges[K][I] += ClusterEdges[K][J]; 217 } 218 // Adjust the weights of the remaining edges and re-sort the queue. 219 adjustQueue(Queue, BF); 220 LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); 221 } else { 222 // Case 4: Both SrcBB and DstBB are allocated in positions we cannot 223 // merge them. Add the count of this edge to the inter-cluster edge count 224 // between clusters A and B to help us decide ordering between these 225 // clusters. 226 if (ComputeEdges) 227 ClusterEdges[I][J] += E.Count; 228 LLVM_DEBUG( 229 dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); 230 } 231 } 232 } 233 234 void GreedyClusterAlgorithm::reset() { 235 ClusterAlgorithm::reset(); 236 BBToClusterMap.clear(); 237 } 238 239 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 240 const BinaryFunction &BF) { 241 // Define a comparison function to establish SWO between edges. 242 auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) { 243 // With equal weights, prioritize branches with lower index 244 // source/destination. This helps to keep original block order for blocks 245 // when optimal order cannot be deducted from a profile. 246 if (A.Count == B.Count) { 247 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 248 return (SrcOrder != 0) 249 ? SrcOrder > 0 250 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 251 } 252 return A.Count < B.Count; 253 }; 254 255 // Sort edges in increasing profile count order. 256 std::sort(Queue.begin(), Queue.end(), Comp); 257 } 258 259 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 260 const BinaryFunction &BF) { 261 // Nothing to do. 262 return; 263 } 264 265 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front, 266 const ClusterTy &Back, 267 const EdgeTy &E) const { 268 return Front.back() == E.Src && Back.front() == E.Dst; 269 } 270 271 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( 272 const EdgeTy &E, const BinaryFunction &BF) const { 273 const BinaryBasicBlock *SrcBB = E.Src; 274 const BinaryBasicBlock *DstBB = E.Dst; 275 276 // Initial weight value. 277 int64_t W = (int64_t)E.Count; 278 279 // Adjust the weight by taking into account other edges with the same source. 280 auto BI = SrcBB->branch_info_begin(); 281 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 282 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 283 "attempted reordering blocks of function with no profile data"); 284 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 285 "overflow detected"); 286 // Ignore edges with same source and destination, edges that target the 287 // entry block as well as the edge E itself. 288 if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) 289 W -= (int64_t)BI->Count; 290 ++BI; 291 } 292 293 // Adjust the weight by taking into account other edges with the same 294 // destination. 295 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 296 // Ignore edges with same source and destination as well as the edge E 297 // itself. 298 if (PredBB == DstBB || PredBB == SrcBB) 299 continue; 300 auto BI = PredBB->branch_info_begin(); 301 for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { 302 if (SuccBB == DstBB) 303 break; 304 ++BI; 305 } 306 assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); 307 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 308 "attempted reordering blocks of function with no profile data"); 309 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 310 "overflow detected"); 311 W -= (int64_t)BI->Count; 312 } 313 314 return W; 315 } 316 317 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 318 const BinaryFunction &BF) { 319 // Initialize edge weights. 320 for (const EdgeTy &E : Queue) 321 Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); 322 323 // Sort edges in increasing weight order. 324 adjustQueue(Queue, BF); 325 } 326 327 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 328 const BinaryFunction &BF) { 329 // Define a comparison function to establish SWO between edges. 330 auto Comp = [&](const EdgeTy &A, const EdgeTy &B) { 331 // With equal weights, prioritize branches with lower index 332 // source/destination. This helps to keep original block order for blocks 333 // when optimal order cannot be deduced from a profile. 334 if (Weight[A] == Weight[B]) { 335 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 336 return (SrcOrder != 0) 337 ? SrcOrder > 0 338 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 339 } 340 return Weight[A] < Weight[B]; 341 }; 342 343 // Iterate through all remaining edges to find edges that have their 344 // source and destination in the same cluster. 345 std::vector<EdgeTy> NewQueue; 346 for (const EdgeTy &E : Queue) { 347 const BinaryBasicBlock *SrcBB = E.Src; 348 const BinaryBasicBlock *DstBB = E.Dst; 349 350 // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore 351 // this edge. 352 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 353 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 354 dbgs() << " (same src, dst)\n"); 355 continue; 356 } 357 358 int I = BBToClusterMap[SrcBB]; 359 int J = BBToClusterMap[DstBB]; 360 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 361 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 362 363 // Case 2: They are already allocated at the same cluster or incompatible 364 // clusters. Adjust the weights of edges with the same source or 365 // destination, so that this edge has no effect on them any more, and ignore 366 // this edge. Also increase the intra- (or inter-) cluster edge count. 367 if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { 368 if (!ClusterEdges.empty()) 369 ClusterEdges[I][J] += E.Count; 370 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 371 dbgs() << " (src, dst belong to same cluster or incompatible " 372 "clusters)\n"); 373 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 374 if (SuccBB == DstBB) 375 continue; 376 auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); 377 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 378 WI->second += (int64_t)E.Count; 379 } 380 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 381 if (PredBB == SrcBB) 382 continue; 383 auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); 384 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 385 WI->second += (int64_t)E.Count; 386 } 387 continue; 388 } 389 390 // Case 3: None of the previous cases is true, so just keep this edge in 391 // the queue. 392 NewQueue.emplace_back(E); 393 } 394 395 // Sort remaining edges in increasing weight order. 396 Queue.swap(NewQueue); 397 std::sort(Queue.begin(), Queue.end(), Comp); 398 } 399 400 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( 401 const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { 402 return Front.back() == E.Src && Back.front() == E.Dst; 403 } 404 405 void MinBranchGreedyClusterAlgorithm::reset() { 406 GreedyClusterAlgorithm::reset(); 407 Weight.clear(); 408 } 409 410 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 411 BasicBlockOrder &Order) const { 412 std::vector<std::vector<uint64_t>> Weight; 413 std::vector<BinaryBasicBlock *> IndexToBB; 414 415 const size_t N = BF.layout_size(); 416 assert(N <= std::numeric_limits<uint64_t>::digits && 417 "cannot use TSP solution for sizes larger than bits in uint64_t"); 418 419 // Populating weight map and index map 420 for (BinaryBasicBlock *BB : BF.layout()) { 421 BB->setLayoutIndex(IndexToBB.size()); 422 IndexToBB.push_back(BB); 423 } 424 Weight.resize(N); 425 for (BinaryBasicBlock *BB : BF.layout()) { 426 auto BI = BB->branch_info_begin(); 427 Weight[BB->getLayoutIndex()].resize(N); 428 for (BinaryBasicBlock *SuccBB : BB->successors()) { 429 if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) 430 Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; 431 ++BI; 432 } 433 } 434 435 std::vector<std::vector<int64_t>> DP; 436 DP.resize(1 << N); 437 for (std::vector<int64_t> &Elmt : DP) { 438 Elmt.resize(N, -1); 439 } 440 // Start with the entry basic block being allocated with cost zero 441 DP[1][0] = 0; 442 // Walk through TSP solutions using a bitmask to represent state (current set 443 // of BBs in the layout) 444 uint64_t BestSet = 1; 445 uint64_t BestLast = 0; 446 int64_t BestWeight = 0; 447 for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { 448 // Traverse each possibility of Last BB visited in this layout 449 for (uint64_t Last = 0; Last < N; ++Last) { 450 // Case 1: There is no possible layout with this BB as Last 451 if (DP[Set][Last] == -1) 452 continue; 453 454 // Case 2: There is a layout with this Set and this Last, and we try 455 // to expand this set with New 456 for (uint64_t New = 1; New < N; ++New) { 457 // Case 2a: BB "New" is already in this Set 458 if ((Set & (1ULL << New)) != 0) 459 continue; 460 461 // Case 2b: BB "New" is not in this set and we add it to this Set and 462 // record total weight of this layout with "New" as the last BB. 463 uint64_t NewSet = (Set | (1ULL << New)); 464 if (DP[NewSet][New] == -1) 465 DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; 466 DP[NewSet][New] = std::max(DP[NewSet][New], 467 DP[Set][Last] + (int64_t)Weight[Last][New]); 468 469 if (DP[NewSet][New] > BestWeight) { 470 BestWeight = DP[NewSet][New]; 471 BestSet = NewSet; 472 BestLast = New; 473 } 474 } 475 } 476 } 477 478 // Define final function layout based on layout that maximizes weight 479 uint64_t Last = BestLast; 480 uint64_t Set = BestSet; 481 std::vector<bool> Visited; 482 Visited.resize(N); 483 Visited[Last] = true; 484 Order.push_back(IndexToBB[Last]); 485 Set = Set & ~(1ULL << Last); 486 while (Set != 0) { 487 int64_t Best = -1; 488 uint64_t NewLast; 489 for (uint64_t I = 0; I < N; ++I) { 490 if (DP[Set][I] == -1) 491 continue; 492 int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; 493 if (DP[Set][I] + AdjWeight > Best) { 494 NewLast = I; 495 Best = DP[Set][I] + AdjWeight; 496 } 497 } 498 Last = NewLast; 499 Visited[Last] = true; 500 Order.push_back(IndexToBB[Last]); 501 Set = Set & ~(1ULL << Last); 502 } 503 std::reverse(Order.begin(), Order.end()); 504 505 // Finalize layout with BBs that weren't assigned to the layout using the 506 // input layout. 507 for (BinaryBasicBlock *BB : BF.layout()) { 508 if (Visited[BB->getLayoutIndex()] == false) 509 Order.push_back(BB); 510 } 511 } 512 513 void OptimizeReorderAlgorithm::reorderBasicBlocks( 514 const BinaryFunction &BF, BasicBlockOrder &Order) const { 515 if (BF.layout_empty()) 516 return; 517 518 // Cluster basic blocks. 519 CAlgo->clusterBasicBlocks(BF); 520 521 if (opts::PrintClusters) 522 CAlgo->printClusters(); 523 524 // Arrange basic blocks according to clusters. 525 for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) 526 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 527 } 528 529 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( 530 const BinaryFunction &BF, BasicBlockOrder &Order) const { 531 if (BF.layout_empty()) 532 return; 533 534 // Cluster basic blocks. 535 CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true); 536 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 537 std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = 538 CAlgo->ClusterEdges; 539 540 // Compute clusters' average frequencies. 541 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 542 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 543 544 if (opts::PrintClusters) 545 CAlgo->printClusters(); 546 547 // Cluster layout order 548 std::vector<uint32_t> ClusterOrder; 549 550 // Do a topological sort for clusters, prioritizing frequently-executed BBs 551 // during the traversal. 552 std::stack<uint32_t> Stack; 553 std::vector<uint32_t> Status; 554 std::vector<uint32_t> Parent; 555 Status.resize(Clusters.size(), 0); 556 Parent.resize(Clusters.size(), 0); 557 constexpr uint32_t STACKED = 1; 558 constexpr uint32_t VISITED = 2; 559 Status[0] = STACKED; 560 Stack.push(0); 561 while (!Stack.empty()) { 562 uint32_t I = Stack.top(); 563 if (!(Status[I] & VISITED)) { 564 Status[I] |= VISITED; 565 // Order successors by weight 566 auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { 567 return ClusterEdges[I][A] > ClusterEdges[I][B]; 568 }; 569 std::priority_queue<uint32_t, std::vector<uint32_t>, 570 decltype(ClusterComp)> 571 SuccQueue(ClusterComp); 572 for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { 573 if (Target.second > 0 && !(Status[Target.first] & STACKED) && 574 !Clusters[Target.first].empty()) { 575 Parent[Target.first] = I; 576 Status[Target.first] = STACKED; 577 SuccQueue.push(Target.first); 578 } 579 } 580 while (!SuccQueue.empty()) { 581 Stack.push(SuccQueue.top()); 582 SuccQueue.pop(); 583 } 584 continue; 585 } 586 // Already visited this node 587 Stack.pop(); 588 ClusterOrder.push_back(I); 589 } 590 std::reverse(ClusterOrder.begin(), ClusterOrder.end()); 591 // Put unreachable clusters at the end 592 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 593 if (!(Status[I] & VISITED) && !Clusters[I].empty()) 594 ClusterOrder.push_back(I); 595 596 // Sort nodes with equal precedence 597 auto Beg = ClusterOrder.begin(); 598 // Don't reorder the first cluster, which contains the function entry point 599 ++Beg; 600 std::stable_sort(Beg, ClusterOrder.end(), 601 [&AvgFreq, &Parent](uint32_t A, uint32_t B) { 602 uint32_t P = Parent[A]; 603 while (Parent[P] != 0) { 604 if (Parent[P] == B) 605 return false; 606 P = Parent[P]; 607 } 608 P = Parent[B]; 609 while (Parent[P] != 0) { 610 if (Parent[P] == A) 611 return true; 612 P = Parent[P]; 613 } 614 return AvgFreq[A] > AvgFreq[B]; 615 }); 616 617 if (opts::PrintClusters) { 618 errs() << "New cluster order: "; 619 const char *Sep = ""; 620 for (uint32_t O : ClusterOrder) { 621 errs() << Sep << O; 622 Sep = ", "; 623 } 624 errs() << '\n'; 625 } 626 627 // Arrange basic blocks according to cluster order. 628 for (uint32_t ClusterIndex : ClusterOrder) { 629 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 630 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 631 } 632 } 633 634 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( 635 const BinaryFunction &BF, BasicBlockOrder &Order) const { 636 if (BF.layout_empty()) 637 return; 638 639 const uint64_t ColdThreshold = 640 opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000; 641 642 // Cluster basic blocks. 643 CAlgo->clusterBasicBlocks(BF); 644 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 645 646 // Compute clusters' average frequencies. 647 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 648 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 649 650 if (opts::PrintClusters) 651 CAlgo->printClusters(); 652 653 // Cluster layout order 654 std::vector<uint32_t> ClusterOrder; 655 656 // Order clusters based on average instruction execution frequency 657 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 658 if (!Clusters[I].empty()) 659 ClusterOrder.push_back(I); 660 // Don't reorder the first cluster, which contains the function entry point 661 std::stable_sort( 662 std::next(ClusterOrder.begin()), ClusterOrder.end(), 663 [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; }); 664 665 if (opts::PrintClusters) { 666 errs() << "New cluster order: "; 667 const char *Sep = ""; 668 for (uint32_t O : ClusterOrder) { 669 errs() << Sep << O; 670 Sep = ", "; 671 } 672 errs() << '\n'; 673 } 674 675 // Arrange basic blocks according to cluster order. 676 for (uint32_t ClusterIndex : ClusterOrder) { 677 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 678 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 679 // Force zero execution count on clusters that do not meet the cut off 680 // specified by --cold-threshold. 681 if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) { 682 for (BinaryBasicBlock *BBPtr : Cluster) { 683 BBPtr->setExecutionCount(0); 684 } 685 } 686 } 687 } 688 689 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 690 BasicBlockOrder &Order) const { 691 if (BF.layout_empty()) 692 return; 693 694 BinaryBasicBlock *FirstBB = *BF.layout_begin(); 695 Order.push_back(FirstBB); 696 for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) 697 Order.push_back(*RLI); 698 } 699 700 void RandomClusterReorderAlgorithm::reorderBasicBlocks( 701 const BinaryFunction &BF, BasicBlockOrder &Order) const { 702 if (BF.layout_empty()) 703 return; 704 705 // Cluster basic blocks. 706 CAlgo->clusterBasicBlocks(BF); 707 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 708 709 if (opts::PrintClusters) 710 CAlgo->printClusters(); 711 712 // Cluster layout order 713 std::vector<uint32_t> ClusterOrder; 714 715 // Order clusters based on average instruction execution frequency 716 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 717 if (!Clusters[I].empty()) 718 ClusterOrder.push_back(I); 719 720 std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), 721 std::default_random_engine(opts::RandomSeed.getValue())); 722 723 if (opts::PrintClusters) { 724 errs() << "New cluster order: "; 725 const char *Sep = ""; 726 for (uint32_t O : ClusterOrder) { 727 errs() << Sep << O; 728 Sep = ", "; 729 } 730 errs() << '\n'; 731 } 732 733 // Arrange basic blocks according to cluster order. 734 for (uint32_t ClusterIndex : ClusterOrder) { 735 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 736 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 737 } 738 } 739