xref: /llvm-project/bolt/lib/Passes/ReorderAlgorithm.cpp (revision 2f09f445b2d6b3ef197aecd8d1e06d08140380f3)
1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements classes used by several basic block reordering
10 // algorithms.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "bolt/Passes/ReorderAlgorithm.h"
15 #include "bolt/Core/BinaryBasicBlock.h"
16 #include "bolt/Core/BinaryFunction.h"
17 #include "llvm/Support/CommandLine.h"
18 #include <queue>
19 #include <random>
20 #include <stack>
21 
22 #undef  DEBUG_TYPE
23 #define DEBUG_TYPE "bolt"
24 
25 using namespace llvm;
26 using namespace bolt;
27 
namespace opts {

// Shared BOLT command-line state, defined in other translation units.
extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;

// Threshold (tenths of a percent of the entry block's execution count)
// below which a cluster's average frequency is considered cold; consumed
// by OptimizeCacheReorderAlgorithm::reorderBasicBlocks.
static cl::opt<unsigned> ColdThreshold(
    "cold-threshold",
    cl::desc("tenths of percents of main entry frequency to use as a "
             "threshold when evaluating whether a basic block is cold "
             "(0 means it is only considered cold if the block has zero "
             "samples). Default: 0 "),
    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

// Dump each cluster's contents (and the chosen cluster order) to stderr.
static cl::opt<bool>
PrintClusters("print-clusters",
  cl::desc("print clusters"),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltOptCategory));

// Seed for the random cluster shuffle (RandomClusterReorderAlgorithm).
// Non-static: presumably referenced from other BOLT files -- TODO confirm.
cl::opt<uint32_t>
RandomSeed("bolt-seed",
  cl::desc("seed for randomization"),
  cl::init(42),
  cl::ZeroOrMore,
  cl::Hidden,
  cl::cat(BoltOptCategory));

} // namespace opts
57 
58 namespace {
59 
60 template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
61   std::hash<T> Hasher;
62   Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
63 }
64 
65 template <typename A, typename B> struct HashPair {
66   size_t operator()(const std::pair<A, B> &Val) const {
67     std::hash<A> Hasher;
68     size_t Seed = Hasher(Val.first);
69     hashCombine(Seed, Val.second);
70     return Seed;
71   }
72 };
73 
74 } // namespace
75 
76 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
77   // Create a separate MCCodeEmitter to allow lock-free execution
78   BinaryContext::IndependentCodeEmitter Emitter;
79   if (!opts::NoThreads) {
80     Emitter = BC.createIndependentMCCodeEmitter();
81   }
82 
83   AvgFreq.resize(Clusters.size(), 0.0);
84   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
85     double Freq = 0.0;
86     uint64_t ClusterSize = 0;
87     for (BinaryBasicBlock *BB : Clusters[I]) {
88       if (BB->getNumNonPseudos() > 0) {
89         Freq += BB->getExecutionCount();
90         // Estimate the size of a block in bytes at run time
91         // NOTE: This might be inaccurate
92         ClusterSize += BB->estimateSize(Emitter.MCE.get());
93       }
94     }
95     AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
96   }
97 }
98 
99 void ClusterAlgorithm::printClusters() const {
100   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
101     errs() << "Cluster number " << I;
102     if (AvgFreq.size() == Clusters.size())
103       errs() << " (frequency: " << AvgFreq[I] << ")";
104     errs() << " : ";
105     const char *Sep = "";
106     for (BinaryBasicBlock *BB : Clusters[I]) {
107       errs() << Sep << BB->getName();
108       Sep = ", ";
109     }
110     errs() << "\n";
111   }
112 }
113 
114 void ClusterAlgorithm::reset() {
115   Clusters.clear();
116   ClusterEdges.clear();
117   AvgFreq.clear();
118 }
119 
120 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
121   OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
122 }
123 
124 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
125   HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
126   return Hasher(std::make_pair(E.Src, E.Dst));
127 }
128 
129 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
130                                                    const EdgeTy &B) const {
131   return A.Src == B.Src && A.Dst == B.Dst;
132 }
133 
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
                                                bool ComputeEdges) {
  reset();

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight during a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.

  // This is the queue of edges from which we will pop edges and use them to
  // cluster basic blocks in a greedy fashion.
  std::vector<EdgeTy> Queue;

  // Initialize inter-cluster weights.
  if (ComputeEdges)
    ClusterEdges.resize(BF.layout_size());

  // Initialize clusters and edge queue.
  for (BinaryBasicBlock *BB : BF.layout()) {
    // Create a singleton cluster for this BB.
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate the queue with all outgoing CFG edges. BI walks the branch
    // profile info in lock-step with the successor list.
    auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *&I : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, I, BI->Count));
      ++BI;
    }
  }
  // Sort and adjust the edge queue (subclass hook).
  initQueue(Queue, BF);

  // Grow clusters in a greedy fashion, consuming edges from the back of the
  // queue, i.e. in decreasing priority as established by initQueue.
  while (!Queue.empty()) {
    EdgeTy E = Queue.back();
    Queue.pop_back();

    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

    // Case 1: SrcBB and DstBB are the same, or DstBB is the entry block
    // (which must stay first in the layout). Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];

    // Case 2: If they are already allocated at the same cluster, just increase
    // the weight of this cluster.
    if (I == J) {
      if (ComputeEdges)
        ClusterEdges[I][I] += E.Count;
      LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
      continue;
    }

    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
    if (areClustersCompatible(ClusterA, ClusterB, E)) {
      // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
      // allowing us to merge two clusters: ClusterB is appended to ClusterA
      // and left as an empty shell, and BBToClusterMap is remapped so B's
      // blocks resolve to cluster I from now on.
      for (BinaryBasicBlock *BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited edges
        // from cluster B to cluster A.
        ClusterEdges[I][I] += E.Count;
        ClusterEdges[I][I] += ClusterEdges[J][I];
        // Iterate through all inter-cluster edges and transfer edges targeting
        // cluster B to cluster A.
        for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
          ClusterEdges[K][I] += ClusterEdges[K][J];
      }
      // Adjust the weights of the remaining edges and re-sort the queue
      // (subclass hook; a no-op for the Pettis-Hansen variant).
      adjustQueue(Queue, BF);
      LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
    } else {
      // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
      // merge them. Add the count of this edge to the inter-cluster edge count
      // between clusters A and B to help us decide ordering between these
      // clusters.
      if (ComputeEdges)
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(
          dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
    }
  }
}
233 
234 void GreedyClusterAlgorithm::reset() {
235   ClusterAlgorithm::reset();
236   BBToClusterMap.clear();
237 }
238 
239 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
240                                          const BinaryFunction &BF) {
241   // Define a comparison function to establish SWO between edges.
242   auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
243     // With equal weights, prioritize branches with lower index
244     // source/destination. This helps to keep original block order for blocks
245     // when optimal order cannot be deducted from a profile.
246     if (A.Count == B.Count) {
247       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
248       return (SrcOrder != 0)
249                  ? SrcOrder > 0
250                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
251     }
252     return A.Count < B.Count;
253   };
254 
255   // Sort edges in increasing profile count order.
256   std::sort(Queue.begin(), Queue.end(), Comp);
257 }
258 
259 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
260                                            const BinaryFunction &BF) {
261   // Nothing to do.
262   return;
263 }
264 
265 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
266                                                      const ClusterTy &Back,
267                                                      const EdgeTy &E) const {
268   return Front.back() == E.Src && Back.front() == E.Dst;
269 }
270 
// Compute the priority weight of edge E: its own profile count minus the
// counts of all competing edges, i.e. other outgoing edges of E.Src and
// other incoming edges of E.Dst. The result may be negative; a larger
// weight suggests that making E a fall-through is more profitable for
// minimizing taken branches.
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value: E's own profile count.
  int64_t W = (int64_t)E.Count;

  // Subtract the counts of SrcBB's other outgoing edges. BI walks the
  // branch profile info in lock-step with the successor list.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with same source and destination, edges that target the
    // entry block as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Subtract the counts of DstBB's other incoming edges.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    // Locate the branch info of edge PredBB -> DstBB by scanning PredBB's
    // successor list in lock-step with its branch info.
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}
316 
317 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
318                                                 const BinaryFunction &BF) {
319   // Initialize edge weights.
320   for (const EdgeTy &E : Queue)
321     Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
322 
323   // Sort edges in increasing weight order.
324   adjustQueue(Queue, BF);
325 }
326 
void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                  const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges. Lighter
  // edges sort first; the consumer pops from the back, so heavier edges are
  // processed first.
  auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges, dropping those that can no longer
  // be merged and compensating the cached weights of their competitors.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
    // this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

    // Case 2: They are already allocated at the same cluster or incompatible
    // clusters. Adjust the weights of edges with the same source or
    // destination, so that this edge has no effect on them any more, and ignore
    // this edge. Also increase the intra- (or inter-) cluster edge count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (src, dst belong to same cluster or incompatible "
                           "clusters)\n");
      // E no longer competes with its sibling edges, so give its count back
      // to their cached weights (calculateWeight subtracted it). Lookup keys
      // use a dummy count of 0: EdgeHash/EdgeEqual ignore Count.
      for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  std::sort(Queue.begin(), Queue.end(), Comp);
}
399 
400 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
401     const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
402   return Front.back() == E.Src && Back.front() == E.Dst;
403 }
404 
405 void MinBranchGreedyClusterAlgorithm::reset() {
406   GreedyClusterAlgorithm::reset();
407   Weight.clear();
408 }
409 
410 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
411                                              BasicBlockOrder &Order) const {
412   std::vector<std::vector<uint64_t>> Weight;
413   std::vector<BinaryBasicBlock *> IndexToBB;
414 
415   const size_t N = BF.layout_size();
416   assert(N <= std::numeric_limits<uint64_t>::digits &&
417          "cannot use TSP solution for sizes larger than bits in uint64_t");
418 
419   // Populating weight map and index map
420   for (BinaryBasicBlock *BB : BF.layout()) {
421     BB->setLayoutIndex(IndexToBB.size());
422     IndexToBB.push_back(BB);
423   }
424   Weight.resize(N);
425   for (BinaryBasicBlock *BB : BF.layout()) {
426     auto BI = BB->branch_info_begin();
427     Weight[BB->getLayoutIndex()].resize(N);
428     for (BinaryBasicBlock *SuccBB : BB->successors()) {
429       if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
430         Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
431       ++BI;
432     }
433   }
434 
435   std::vector<std::vector<int64_t>> DP;
436   DP.resize(1 << N);
437   for (std::vector<int64_t> &Elmt : DP) {
438     Elmt.resize(N, -1);
439   }
440   // Start with the entry basic block being allocated with cost zero
441   DP[1][0] = 0;
442   // Walk through TSP solutions using a bitmask to represent state (current set
443   // of BBs in the layout)
444   uint64_t BestSet = 1;
445   uint64_t BestLast = 0;
446   int64_t BestWeight = 0;
447   for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
448     // Traverse each possibility of Last BB visited in this layout
449     for (uint64_t Last = 0; Last < N; ++Last) {
450       // Case 1: There is no possible layout with this BB as Last
451       if (DP[Set][Last] == -1)
452         continue;
453 
454       // Case 2: There is a layout with this Set and this Last, and we try
455       // to expand this set with New
456       for (uint64_t New = 1; New < N; ++New) {
457         // Case 2a: BB "New" is already in this Set
458         if ((Set & (1ULL << New)) != 0)
459           continue;
460 
461         // Case 2b: BB "New" is not in this set and we add it to this Set and
462         // record total weight of this layout with "New" as the last BB.
463         uint64_t NewSet = (Set | (1ULL << New));
464         if (DP[NewSet][New] == -1)
465           DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
466         DP[NewSet][New] = std::max(DP[NewSet][New],
467                                    DP[Set][Last] + (int64_t)Weight[Last][New]);
468 
469         if (DP[NewSet][New] > BestWeight) {
470           BestWeight = DP[NewSet][New];
471           BestSet = NewSet;
472           BestLast = New;
473         }
474       }
475     }
476   }
477 
478   // Define final function layout based on layout that maximizes weight
479   uint64_t Last = BestLast;
480   uint64_t Set = BestSet;
481   std::vector<bool> Visited;
482   Visited.resize(N);
483   Visited[Last] = true;
484   Order.push_back(IndexToBB[Last]);
485   Set = Set & ~(1ULL << Last);
486   while (Set != 0) {
487     int64_t Best = -1;
488     uint64_t NewLast;
489     for (uint64_t I = 0; I < N; ++I) {
490       if (DP[Set][I] == -1)
491         continue;
492       int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
493       if (DP[Set][I] + AdjWeight > Best) {
494         NewLast = I;
495         Best = DP[Set][I] + AdjWeight;
496       }
497     }
498     Last = NewLast;
499     Visited[Last] = true;
500     Order.push_back(IndexToBB[Last]);
501     Set = Set & ~(1ULL << Last);
502   }
503   std::reverse(Order.begin(), Order.end());
504 
505   // Finalize layout with BBs that weren't assigned to the layout using the
506   // input layout.
507   for (BinaryBasicBlock *BB : BF.layout()) {
508     if (Visited[BB->getLayoutIndex()] == false)
509       Order.push_back(BB);
510   }
511 }
512 
513 void OptimizeReorderAlgorithm::reorderBasicBlocks(
514     const BinaryFunction &BF, BasicBlockOrder &Order) const {
515   if (BF.layout_empty())
516     return;
517 
518   // Cluster basic blocks.
519   CAlgo->clusterBasicBlocks(BF);
520 
521   if (opts::PrintClusters)
522     CAlgo->printClusters();
523 
524   // Arrange basic blocks according to clusters.
525   for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
526     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
527 }
528 
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks, also collecting inter-cluster edge counts.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
      CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal. Iterative DFS: Status holds STACKED/VISITED flag
  // bits per cluster, Parent records the DFS tree for the tie-break below.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      // First time we see this cluster: mark it and push its successors so
      // the hottest outgoing edge is visited first (top of the stack).
      Status[I] |= VISITED;
      // Order successors by weight
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)>
          SuccQueue(ClusterComp);
      for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
        // Skip zero-count edges, already-stacked targets, and clusters that
        // were emptied by merging.
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node: emit it in post-order.
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  // Reverse post-order gives a topological-style order.
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence: hotter clusters first, but never move
  // a cluster above one of its DFS ancestors (the Parent-chain walks below).
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point
  ++Beg;
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
633 
634 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
635     const BinaryFunction &BF, BasicBlockOrder &Order) const {
636   if (BF.layout_empty())
637     return;
638 
639   const uint64_t ColdThreshold =
640       opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000;
641 
642   // Cluster basic blocks.
643   CAlgo->clusterBasicBlocks(BF);
644   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
645 
646   // Compute clusters' average frequencies.
647   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
648   std::vector<double> &AvgFreq = CAlgo->AvgFreq;
649 
650   if (opts::PrintClusters)
651     CAlgo->printClusters();
652 
653   // Cluster layout order
654   std::vector<uint32_t> ClusterOrder;
655 
656   // Order clusters based on average instruction execution frequency
657   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
658     if (!Clusters[I].empty())
659       ClusterOrder.push_back(I);
660   // Don't reorder the first cluster, which contains the function entry point
661   std::stable_sort(
662       std::next(ClusterOrder.begin()), ClusterOrder.end(),
663       [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });
664 
665   if (opts::PrintClusters) {
666     errs() << "New cluster order: ";
667     const char *Sep = "";
668     for (uint32_t O : ClusterOrder) {
669       errs() << Sep << O;
670       Sep = ", ";
671     }
672     errs() << '\n';
673   }
674 
675   // Arrange basic blocks according to cluster order.
676   for (uint32_t ClusterIndex : ClusterOrder) {
677     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
678     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
679     // Force zero execution count on clusters that do not meet the cut off
680     // specified by --cold-threshold.
681     if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold)) {
682       for (BinaryBasicBlock *BBPtr : Cluster) {
683         BBPtr->setExecutionCount(0);
684       }
685     }
686   }
687 }
688 
689 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
690                                                  BasicBlockOrder &Order) const {
691   if (BF.layout_empty())
692     return;
693 
694   BinaryBasicBlock *FirstBB = *BF.layout_begin();
695   Order.push_back(FirstBB);
696   for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
697     Order.push_back(*RLI);
698 }
699 
700 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
701     const BinaryFunction &BF, BasicBlockOrder &Order) const {
702   if (BF.layout_empty())
703     return;
704 
705   // Cluster basic blocks.
706   CAlgo->clusterBasicBlocks(BF);
707   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
708 
709   if (opts::PrintClusters)
710     CAlgo->printClusters();
711 
712   // Cluster layout order
713   std::vector<uint32_t> ClusterOrder;
714 
715   // Order clusters based on average instruction execution frequency
716   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
717     if (!Clusters[I].empty())
718       ClusterOrder.push_back(I);
719 
720   std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
721                std::default_random_engine(opts::RandomSeed.getValue()));
722 
723   if (opts::PrintClusters) {
724     errs() << "New cluster order: ";
725     const char *Sep = "";
726     for (uint32_t O : ClusterOrder) {
727       errs() << Sep << O;
728       Sep = ", ";
729     }
730     errs() << '\n';
731   }
732 
733   // Arrange basic blocks according to cluster order.
734   for (uint32_t ClusterIndex : ClusterOrder) {
735     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
736     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
737   }
738 }
739