1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements classes used by several basic block reordering 10 // algorithms. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "bolt/Passes/ReorderAlgorithm.h" 15 #include "bolt/Core/BinaryBasicBlock.h" 16 #include "bolt/Core/BinaryFunction.h" 17 #include "llvm/Support/CommandLine.h" 18 #include <queue> 19 #include <random> 20 #include <stack> 21 22 #undef DEBUG_TYPE 23 #define DEBUG_TYPE "bolt" 24 25 using namespace llvm; 26 using namespace bolt; 27 28 namespace opts { 29 30 extern cl::OptionCategory BoltOptCategory; 31 extern cl::opt<bool> NoThreads; 32 33 static cl::opt<unsigned> ColdThreshold( 34 "cold-threshold", 35 cl::desc("tenths of percents of main entry frequency to use as a " 36 "threshold when evaluating whether a basic block is cold " 37 "(0 means it is only considered cold if the block has zero " 38 "samples). Default: 0 "), 39 cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory)); 40 41 static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"), 42 cl::Hidden, cl::cat(BoltOptCategory)); 43 44 cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"), 45 cl::init(42), cl::Hidden, 46 cl::cat(BoltOptCategory)); 47 48 } // namespace opts 49 50 namespace { 51 52 template <class T> inline void hashCombine(size_t &Seed, const T &Val) { 53 std::hash<T> Hasher; 54 Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2); 55 } 56 57 template <typename A, typename B> struct HashPair { 58 size_t operator()(const std::pair<A, B> &Val) const { 59 std::hash<A> Hasher; 60 size_t Seed = Hasher(Val.first); 61 hashCombine(Seed, Val.second); 62 return Seed; 63 } 64 }; 65 66 } // namespace 67 68 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) { 69 // Create a separate MCCodeEmitter to allow lock-free execution 70 BinaryContext::IndependentCodeEmitter Emitter; 71 if (!opts::NoThreads) 72 Emitter = BC.createIndependentMCCodeEmitter(); 73 74 AvgFreq.resize(Clusters.size(), 0.0); 75 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 76 double Freq = 0.0; 77 uint64_t ClusterSize = 0; 78 for (BinaryBasicBlock *BB : Clusters[I]) { 79 if (BB->getNumNonPseudos() > 0) { 80 Freq += BB->getExecutionCount(); 81 // Estimate the size of a block in bytes at run time 82 // NOTE: This might be inaccurate 83 ClusterSize += BB->estimateSize(Emitter.MCE.get()); 84 } 85 } 86 AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize; 87 } 88 } 89 90 void ClusterAlgorithm::printClusters() const { 91 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) { 92 errs() << "Cluster number " << I; 93 if (AvgFreq.size() == Clusters.size()) 94 errs() << " (frequency: " << AvgFreq[I] << ")"; 95 errs() << " : "; 96 const char *Sep = ""; 97 for (BinaryBasicBlock *BB : Clusters[I]) { 98 errs() << Sep << BB->getName(); 99 Sep = ", "; 100 } 101 errs() << "\n"; 102 } 103 } 104 105 void ClusterAlgorithm::reset() { 106 Clusters.clear(); 107 ClusterEdges.clear(); 108 AvgFreq.clear(); 109 } 110 111 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const { 112 OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count; 113 } 114 115 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const { 116 HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher; 117 return Hasher(std::make_pair(E.Src, E.Dst)); 118 } 119 120 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A, 121 const EdgeTy &B) const { 122 return A.Src == B.Src && A.Dst == B.Dst; 123 } 124 125 void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF, 126 bool ComputeEdges) { 127 reset(); 128 129 // Greedy heuristic implementation for the TSP, applied to BB layout. Try to 130 // maximize weight during a path traversing all BBs. In this way, we will 131 // convert the hottest branches into fall-throughs. 132 133 // This is the queue of edges from which we will pop edges and use them to 134 // cluster basic blocks in a greedy fashion. 135 std::vector<EdgeTy> Queue; 136 137 // Initialize inter-cluster weights. 138 if (ComputeEdges) 139 ClusterEdges.resize(BF.layout_size()); 140 141 // Initialize clusters and edge queue. 142 for (BinaryBasicBlock *BB : BF.layout()) { 143 // Create a cluster for this BB. 144 uint32_t I = Clusters.size(); 145 Clusters.emplace_back(); 146 std::vector<BinaryBasicBlock *> &Cluster = Clusters.back(); 147 Cluster.push_back(BB); 148 BBToClusterMap[BB] = I; 149 // Populate priority queue with edges. 150 auto BI = BB->branch_info_begin(); 151 for (BinaryBasicBlock *&I : BB->successors()) { 152 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 153 "attempted reordering blocks of function with no profile data"); 154 Queue.emplace_back(EdgeTy(BB, I, BI->Count)); 155 ++BI; 156 } 157 } 158 // Sort and adjust the edge queue. 159 initQueue(Queue, BF); 160 161 // Grow clusters in a greedy fashion. 162 while (!Queue.empty()) { 163 EdgeTy E = Queue.back(); 164 Queue.pop_back(); 165 166 const BinaryBasicBlock *SrcBB = E.Src; 167 const BinaryBasicBlock *DstBB = E.Dst; 168 169 LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n"); 170 171 // Case 1: BBSrc and BBDst are the same. Ignore this edge 172 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 173 LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n"); 174 continue; 175 } 176 177 int I = BBToClusterMap[SrcBB]; 178 int J = BBToClusterMap[DstBB]; 179 180 // Case 2: If they are already allocated at the same cluster, just increase 181 // the weight of this cluster 182 if (I == J) { 183 if (ComputeEdges) 184 ClusterEdges[I][I] += E.Count; 185 LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n"); 186 continue; 187 } 188 189 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 190 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 191 if (areClustersCompatible(ClusterA, ClusterB, E)) { 192 // Case 3: SrcBB is at the end of a cluster and DstBB is at the start, 193 // allowing us to merge two clusters. 194 for (BinaryBasicBlock *BB : ClusterB) 195 BBToClusterMap[BB] = I; 196 ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end()); 197 ClusterB.clear(); 198 if (ComputeEdges) { 199 // Increase the intra-cluster edge count of cluster A with the count of 200 // this edge as well as with the total count of previously visited edges 201 // from cluster B cluster A. 202 ClusterEdges[I][I] += E.Count; 203 ClusterEdges[I][I] += ClusterEdges[J][I]; 204 // Iterate through all inter-cluster edges and transfer edges targeting 205 // cluster B to cluster A. 206 for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K) 207 ClusterEdges[K][I] += ClusterEdges[K][J]; 208 } 209 // Adjust the weights of the remaining edges and re-sort the queue. 210 adjustQueue(Queue, BF); 211 LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n"); 212 } else { 213 // Case 4: Both SrcBB and DstBB are allocated in positions we cannot 214 // merge them. Add the count of this edge to the inter-cluster edge count 215 // between clusters A and B to help us decide ordering between these 216 // clusters. 217 if (ComputeEdges) 218 ClusterEdges[I][J] += E.Count; 219 LLVM_DEBUG( 220 dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n"); 221 } 222 } 223 } 224 225 void GreedyClusterAlgorithm::reset() { 226 ClusterAlgorithm::reset(); 227 BBToClusterMap.clear(); 228 } 229 230 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 231 const BinaryFunction &BF) { 232 // Define a comparison function to establish SWO between edges. 233 auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) { 234 // With equal weights, prioritize branches with lower index 235 // source/destination. This helps to keep original block order for blocks 236 // when optimal order cannot be deducted from a profile. 237 if (A.Count == B.Count) { 238 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 239 return (SrcOrder != 0) 240 ? SrcOrder > 0 241 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 242 } 243 return A.Count < B.Count; 244 }; 245 246 // Sort edges in increasing profile count order. 247 std::sort(Queue.begin(), Queue.end(), Comp); 248 } 249 250 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 251 const BinaryFunction &BF) { 252 // Nothing to do. 253 return; 254 } 255 256 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front, 257 const ClusterTy &Back, 258 const EdgeTy &E) const { 259 return Front.back() == E.Src && Back.front() == E.Dst; 260 } 261 262 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight( 263 const EdgeTy &E, const BinaryFunction &BF) const { 264 const BinaryBasicBlock *SrcBB = E.Src; 265 const BinaryBasicBlock *DstBB = E.Dst; 266 267 // Initial weight value. 268 int64_t W = (int64_t)E.Count; 269 270 // Adjust the weight by taking into account other edges with the same source. 271 auto BI = SrcBB->branch_info_begin(); 272 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 273 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 274 "attempted reordering blocks of function with no profile data"); 275 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 276 "overflow detected"); 277 // Ignore edges with same source and destination, edges that target the 278 // entry block as well as the edge E itself. 279 if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB) 280 W -= (int64_t)BI->Count; 281 ++BI; 282 } 283 284 // Adjust the weight by taking into account other edges with the same 285 // destination. 286 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 287 // Ignore edges with same source and destination as well as the edge E 288 // itself. 289 if (PredBB == DstBB || PredBB == SrcBB) 290 continue; 291 auto BI = PredBB->branch_info_begin(); 292 for (const BinaryBasicBlock *SuccBB : PredBB->successors()) { 293 if (SuccBB == DstBB) 294 break; 295 ++BI; 296 } 297 assert(BI != PredBB->branch_info_end() && "invalid control flow graph"); 298 assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && 299 "attempted reordering blocks of function with no profile data"); 300 assert(BI->Count <= std::numeric_limits<int64_t>::max() && 301 "overflow detected"); 302 W -= (int64_t)BI->Count; 303 } 304 305 return W; 306 } 307 308 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue, 309 const BinaryFunction &BF) { 310 // Initialize edge weights. 311 for (const EdgeTy &E : Queue) 312 Weight.emplace(std::make_pair(E, calculateWeight(E, BF))); 313 314 // Sort edges in increasing weight order. 315 adjustQueue(Queue, BF); 316 } 317 318 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue, 319 const BinaryFunction &BF) { 320 // Define a comparison function to establish SWO between edges. 321 auto Comp = [&](const EdgeTy &A, const EdgeTy &B) { 322 // With equal weights, prioritize branches with lower index 323 // source/destination. This helps to keep original block order for blocks 324 // when optimal order cannot be deduced from a profile. 325 if (Weight[A] == Weight[B]) { 326 const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src); 327 return (SrcOrder != 0) 328 ? SrcOrder > 0 329 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0; 330 } 331 return Weight[A] < Weight[B]; 332 }; 333 334 // Iterate through all remaining edges to find edges that have their 335 // source and destination in the same cluster. 336 std::vector<EdgeTy> NewQueue; 337 for (const EdgeTy &E : Queue) { 338 const BinaryBasicBlock *SrcBB = E.Src; 339 const BinaryBasicBlock *DstBB = E.Dst; 340 341 // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore 342 // this edge. 343 if (SrcBB == DstBB || DstBB == *BF.layout_begin()) { 344 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 345 dbgs() << " (same src, dst)\n"); 346 continue; 347 } 348 349 int I = BBToClusterMap[SrcBB]; 350 int J = BBToClusterMap[DstBB]; 351 std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I]; 352 std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J]; 353 354 // Case 2: They are already allocated at the same cluster or incompatible 355 // clusters. Adjust the weights of edges with the same source or 356 // destination, so that this edge has no effect on them any more, and ignore 357 // this edge. Also increase the intra- (or inter-) cluster edge count. 358 if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) { 359 if (!ClusterEdges.empty()) 360 ClusterEdges[I][J] += E.Count; 361 LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs()); 362 dbgs() << " (src, dst belong to same cluster or incompatible " 363 "clusters)\n"); 364 for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) { 365 if (SuccBB == DstBB) 366 continue; 367 auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0)); 368 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 369 WI->second += (int64_t)E.Count; 370 } 371 for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) { 372 if (PredBB == SrcBB) 373 continue; 374 auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0)); 375 assert(WI != Weight.end() && "CFG edge not found in Weight map"); 376 WI->second += (int64_t)E.Count; 377 } 378 continue; 379 } 380 381 // Case 3: None of the previous cases is true, so just keep this edge in 382 // the queue. 383 NewQueue.emplace_back(E); 384 } 385 386 // Sort remaining edges in increasing weight order. 387 Queue.swap(NewQueue); 388 std::sort(Queue.begin(), Queue.end(), Comp); 389 } 390 391 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible( 392 const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const { 393 return Front.back() == E.Src && Back.front() == E.Dst; 394 } 395 396 void MinBranchGreedyClusterAlgorithm::reset() { 397 GreedyClusterAlgorithm::reset(); 398 Weight.clear(); 399 } 400 401 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 402 BasicBlockOrder &Order) const { 403 std::vector<std::vector<uint64_t>> Weight; 404 std::vector<BinaryBasicBlock *> IndexToBB; 405 406 const size_t N = BF.layout_size(); 407 assert(N <= std::numeric_limits<uint64_t>::digits && 408 "cannot use TSP solution for sizes larger than bits in uint64_t"); 409 410 // Populating weight map and index map 411 for (BinaryBasicBlock *BB : BF.layout()) { 412 BB->setLayoutIndex(IndexToBB.size()); 413 IndexToBB.push_back(BB); 414 } 415 Weight.resize(N); 416 for (BinaryBasicBlock *BB : BF.layout()) { 417 auto BI = BB->branch_info_begin(); 418 Weight[BB->getLayoutIndex()].resize(N); 419 for (BinaryBasicBlock *SuccBB : BB->successors()) { 420 if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) 421 Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count; 422 ++BI; 423 } 424 } 425 426 std::vector<std::vector<int64_t>> DP; 427 DP.resize(1 << N); 428 for (std::vector<int64_t> &Elmt : DP) 429 Elmt.resize(N, -1); 430 431 // Start with the entry basic block being allocated with cost zero 432 DP[1][0] = 0; 433 // Walk through TSP solutions using a bitmask to represent state (current set 434 // of BBs in the layout) 435 uint64_t BestSet = 1; 436 uint64_t BestLast = 0; 437 int64_t BestWeight = 0; 438 for (uint64_t Set = 1; Set < (1ULL << N); ++Set) { 439 // Traverse each possibility of Last BB visited in this layout 440 for (uint64_t Last = 0; Last < N; ++Last) { 441 // Case 1: There is no possible layout with this BB as Last 442 if (DP[Set][Last] == -1) 443 continue; 444 445 // Case 2: There is a layout with this Set and this Last, and we try 446 // to expand this set with New 447 for (uint64_t New = 1; New < N; ++New) { 448 // Case 2a: BB "New" is already in this Set 449 if ((Set & (1ULL << New)) != 0) 450 continue; 451 452 // Case 2b: BB "New" is not in this set and we add it to this Set and 453 // record total weight of this layout with "New" as the last BB. 454 uint64_t NewSet = (Set | (1ULL << New)); 455 if (DP[NewSet][New] == -1) 456 DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New]; 457 DP[NewSet][New] = std::max(DP[NewSet][New], 458 DP[Set][Last] + (int64_t)Weight[Last][New]); 459 460 if (DP[NewSet][New] > BestWeight) { 461 BestWeight = DP[NewSet][New]; 462 BestSet = NewSet; 463 BestLast = New; 464 } 465 } 466 } 467 } 468 469 // Define final function layout based on layout that maximizes weight 470 uint64_t Last = BestLast; 471 uint64_t Set = BestSet; 472 BitVector Visited; 473 Visited.resize(N); 474 Visited[Last] = true; 475 Order.push_back(IndexToBB[Last]); 476 Set = Set & ~(1ULL << Last); 477 while (Set != 0) { 478 int64_t Best = -1; 479 uint64_t NewLast; 480 for (uint64_t I = 0; I < N; ++I) { 481 if (DP[Set][I] == -1) 482 continue; 483 int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0; 484 if (DP[Set][I] + AdjWeight > Best) { 485 NewLast = I; 486 Best = DP[Set][I] + AdjWeight; 487 } 488 } 489 Last = NewLast; 490 Visited[Last] = true; 491 Order.push_back(IndexToBB[Last]); 492 Set = Set & ~(1ULL << Last); 493 } 494 std::reverse(Order.begin(), Order.end()); 495 496 // Finalize layout with BBs that weren't assigned to the layout using the 497 // input layout. 498 for (BinaryBasicBlock *BB : BF.layout()) 499 if (Visited[BB->getLayoutIndex()] == false) 500 Order.push_back(BB); 501 } 502 503 void OptimizeReorderAlgorithm::reorderBasicBlocks( 504 const BinaryFunction &BF, BasicBlockOrder &Order) const { 505 if (BF.layout_empty()) 506 return; 507 508 // Cluster basic blocks. 509 CAlgo->clusterBasicBlocks(BF); 510 511 if (opts::PrintClusters) 512 CAlgo->printClusters(); 513 514 // Arrange basic blocks according to clusters. 515 for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters) 516 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 517 } 518 519 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks( 520 const BinaryFunction &BF, BasicBlockOrder &Order) const { 521 if (BF.layout_empty()) 522 return; 523 524 // Cluster basic blocks. 525 CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true); 526 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 527 std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges = 528 CAlgo->ClusterEdges; 529 530 // Compute clusters' average frequencies. 531 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 532 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 533 534 if (opts::PrintClusters) 535 CAlgo->printClusters(); 536 537 // Cluster layout order 538 std::vector<uint32_t> ClusterOrder; 539 540 // Do a topological sort for clusters, prioritizing frequently-executed BBs 541 // during the traversal. 542 std::stack<uint32_t> Stack; 543 std::vector<uint32_t> Status; 544 std::vector<uint32_t> Parent; 545 Status.resize(Clusters.size(), 0); 546 Parent.resize(Clusters.size(), 0); 547 constexpr uint32_t STACKED = 1; 548 constexpr uint32_t VISITED = 2; 549 Status[0] = STACKED; 550 Stack.push(0); 551 while (!Stack.empty()) { 552 uint32_t I = Stack.top(); 553 if (!(Status[I] & VISITED)) { 554 Status[I] |= VISITED; 555 // Order successors by weight 556 auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) { 557 return ClusterEdges[I][A] > ClusterEdges[I][B]; 558 }; 559 std::priority_queue<uint32_t, std::vector<uint32_t>, 560 decltype(ClusterComp)> 561 SuccQueue(ClusterComp); 562 for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) { 563 if (Target.second > 0 && !(Status[Target.first] & STACKED) && 564 !Clusters[Target.first].empty()) { 565 Parent[Target.first] = I; 566 Status[Target.first] = STACKED; 567 SuccQueue.push(Target.first); 568 } 569 } 570 while (!SuccQueue.empty()) { 571 Stack.push(SuccQueue.top()); 572 SuccQueue.pop(); 573 } 574 continue; 575 } 576 // Already visited this node 577 Stack.pop(); 578 ClusterOrder.push_back(I); 579 } 580 std::reverse(ClusterOrder.begin(), ClusterOrder.end()); 581 // Put unreachable clusters at the end 582 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 583 if (!(Status[I] & VISITED) && !Clusters[I].empty()) 584 ClusterOrder.push_back(I); 585 586 // Sort nodes with equal precedence 587 auto Beg = ClusterOrder.begin(); 588 // Don't reorder the first cluster, which contains the function entry point 589 ++Beg; 590 std::stable_sort(Beg, ClusterOrder.end(), 591 [&AvgFreq, &Parent](uint32_t A, uint32_t B) { 592 uint32_t P = Parent[A]; 593 while (Parent[P] != 0) { 594 if (Parent[P] == B) 595 return false; 596 P = Parent[P]; 597 } 598 P = Parent[B]; 599 while (Parent[P] != 0) { 600 if (Parent[P] == A) 601 return true; 602 P = Parent[P]; 603 } 604 return AvgFreq[A] > AvgFreq[B]; 605 }); 606 607 if (opts::PrintClusters) { 608 errs() << "New cluster order: "; 609 const char *Sep = ""; 610 for (uint32_t O : ClusterOrder) { 611 errs() << Sep << O; 612 Sep = ", "; 613 } 614 errs() << '\n'; 615 } 616 617 // Arrange basic blocks according to cluster order. 618 for (uint32_t ClusterIndex : ClusterOrder) { 619 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 620 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 621 } 622 } 623 624 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks( 625 const BinaryFunction &BF, BasicBlockOrder &Order) const { 626 if (BF.layout_empty()) 627 return; 628 629 const uint64_t ColdThreshold = 630 opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000; 631 632 // Cluster basic blocks. 633 CAlgo->clusterBasicBlocks(BF); 634 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 635 636 // Compute clusters' average frequencies. 637 CAlgo->computeClusterAverageFrequency(BF.getBinaryContext()); 638 std::vector<double> &AvgFreq = CAlgo->AvgFreq; 639 640 if (opts::PrintClusters) 641 CAlgo->printClusters(); 642 643 // Cluster layout order 644 std::vector<uint32_t> ClusterOrder; 645 646 // Order clusters based on average instruction execution frequency 647 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 648 if (!Clusters[I].empty()) 649 ClusterOrder.push_back(I); 650 // Don't reorder the first cluster, which contains the function entry point 651 std::stable_sort( 652 std::next(ClusterOrder.begin()), ClusterOrder.end(), 653 [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; }); 654 655 if (opts::PrintClusters) { 656 errs() << "New cluster order: "; 657 const char *Sep = ""; 658 for (uint32_t O : ClusterOrder) { 659 errs() << Sep << O; 660 Sep = ", "; 661 } 662 errs() << '\n'; 663 } 664 665 // Arrange basic blocks according to cluster order. 666 for (uint32_t ClusterIndex : ClusterOrder) { 667 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 668 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 669 // Force zero execution count on clusters that do not meet the cut off 670 // specified by --cold-threshold. 671 if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) 672 for (BinaryBasicBlock *BBPtr : Cluster) 673 BBPtr->setExecutionCount(0); 674 } 675 } 676 677 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF, 678 BasicBlockOrder &Order) const { 679 if (BF.layout_empty()) 680 return; 681 682 BinaryBasicBlock *FirstBB = *BF.layout_begin(); 683 Order.push_back(FirstBB); 684 for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI) 685 Order.push_back(*RLI); 686 } 687 688 void RandomClusterReorderAlgorithm::reorderBasicBlocks( 689 const BinaryFunction &BF, BasicBlockOrder &Order) const { 690 if (BF.layout_empty()) 691 return; 692 693 // Cluster basic blocks. 694 CAlgo->clusterBasicBlocks(BF); 695 std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters; 696 697 if (opts::PrintClusters) 698 CAlgo->printClusters(); 699 700 // Cluster layout order 701 std::vector<uint32_t> ClusterOrder; 702 703 // Order clusters based on average instruction execution frequency 704 for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) 705 if (!Clusters[I].empty()) 706 ClusterOrder.push_back(I); 707 708 std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(), 709 std::default_random_engine(opts::RandomSeed.getValue())); 710 711 if (opts::PrintClusters) { 712 errs() << "New cluster order: "; 713 const char *Sep = ""; 714 for (uint32_t O : ClusterOrder) { 715 errs() << Sep << O; 716 Sep = ", "; 717 } 718 errs() << '\n'; 719 } 720 721 // Arrange basic blocks according to cluster order. 722 for (uint32_t ClusterIndex : ClusterOrder) { 723 ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex]; 724 Order.insert(Order.end(), Cluster.begin(), Cluster.end()); 725 } 726 } 727