// xref: /llvm-project/bolt/lib/Passes/ReorderAlgorithm.cpp (revision f92ab6af35343df1f82c3a85feb75d41b6be382e)
1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements classes used by several basic block reordering
10 // algorithms.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "bolt/Passes/ReorderAlgorithm.h"
15 #include "bolt/Core/BinaryBasicBlock.h"
16 #include "bolt/Core/BinaryFunction.h"
17 #include "llvm/Support/CommandLine.h"
18 #include <queue>
19 #include <random>
20 #include <stack>
21 
22 #undef  DEBUG_TYPE
23 #define DEBUG_TYPE "bolt"
24 
25 using namespace llvm;
26 using namespace bolt;
27 
28 namespace opts {
29 
30 extern cl::OptionCategory BoltOptCategory;
31 extern cl::opt<bool> NoThreads;
32 
33 static cl::opt<unsigned> ColdThreshold(
34     "cold-threshold",
35     cl::desc("tenths of percents of main entry frequency to use as a "
36              "threshold when evaluating whether a basic block is cold "
37              "(0 means it is only considered cold if the block has zero "
38              "samples). Default: 0 "),
39     cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
40 
41 static cl::opt<bool>
42 PrintClusters("print-clusters",
43   cl::desc("print clusters"),
44   cl::ZeroOrMore,
45   cl::Hidden,
46   cl::cat(BoltOptCategory));
47 
48 cl::opt<uint32_t>
49 RandomSeed("bolt-seed",
50   cl::desc("seed for randomization"),
51   cl::init(42),
52   cl::ZeroOrMore,
53   cl::Hidden,
54   cl::cat(BoltOptCategory));
55 
56 } // namespace opts
57 
namespace {

/// Mix \p Val into \p Seed using the classic boost::hash_combine formula.
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  const size_t ValHash = std::hash<T>()(Val);
  Seed ^= ValHash + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

/// Hash functor for std::pair that combines the hashes of both members.
template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    size_t Seed = std::hash<A>()(Val.first);
    hashCombine(Seed, Val.second);
    return Seed;
  }
};

} // namespace
75 
76 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
77   // Create a separate MCCodeEmitter to allow lock-free execution
78   BinaryContext::IndependentCodeEmitter Emitter;
79   if (!opts::NoThreads)
80     Emitter = BC.createIndependentMCCodeEmitter();
81 
82   AvgFreq.resize(Clusters.size(), 0.0);
83   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
84     double Freq = 0.0;
85     uint64_t ClusterSize = 0;
86     for (BinaryBasicBlock *BB : Clusters[I]) {
87       if (BB->getNumNonPseudos() > 0) {
88         Freq += BB->getExecutionCount();
89         // Estimate the size of a block in bytes at run time
90         // NOTE: This might be inaccurate
91         ClusterSize += BB->estimateSize(Emitter.MCE.get());
92       }
93     }
94     AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
95   }
96 }
97 
98 void ClusterAlgorithm::printClusters() const {
99   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
100     errs() << "Cluster number " << I;
101     if (AvgFreq.size() == Clusters.size())
102       errs() << " (frequency: " << AvgFreq[I] << ")";
103     errs() << " : ";
104     const char *Sep = "";
105     for (BinaryBasicBlock *BB : Clusters[I]) {
106       errs() << Sep << BB->getName();
107       Sep = ", ";
108     }
109     errs() << "\n";
110   }
111 }
112 
113 void ClusterAlgorithm::reset() {
114   Clusters.clear();
115   ClusterEdges.clear();
116   AvgFreq.clear();
117 }
118 
119 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
120   OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
121 }
122 
123 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
124   HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
125   return Hasher(std::make_pair(E.Src, E.Dst));
126 }
127 
128 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
129                                                    const EdgeTy &B) const {
130   return A.Src == B.Src && A.Dst == B.Dst;
131 }
132 
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
                                                bool ComputeEdges) {
  reset();

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight during a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.

  // This is the queue of edges from which we will pop edges and use them to
  // cluster basic blocks in a greedy fashion.
  std::vector<EdgeTy> Queue;

  // Initialize inter-cluster weights. Indexed by cluster id; only maintained
  // when the caller requested edge accounting.
  if (ComputeEdges)
    ClusterEdges.resize(BF.layout_size());

  // Initialize clusters (one singleton cluster per block) and the edge queue.
  for (BinaryBasicBlock *BB : BF.layout()) {
    // Create a cluster for this BB.
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate priority queue with edges. BI advances in lock-step with the
    // successor list, pairing each outgoing edge with its profile count.
    auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *&I : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, I, BI->Count));
      ++BI;
    }
  }
  // Sort and adjust the edge queue (subclass hook).
  initQueue(Queue, BF);

  // Grow clusters in a greedy fashion; edges are consumed hottest-first by
  // popping from the back of the sorted queue.
  while (!Queue.empty()) {
    EdgeTy E = Queue.back();
    Queue.pop_back();

    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

    // Case 1: BBSrc and BBDst are the same, or the destination is the entry
    // block (which must remain first in the layout). Ignore this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];

    // Case 2: If they are already allocated at the same cluster, just increase
    // the weight of this cluster
    if (I == J) {
      if (ComputeEdges)
        ClusterEdges[I][I] += E.Count;
      LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
      continue;
    }

    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
    if (areClustersCompatible(ClusterA, ClusterB, E)) {
      // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
      // allowing us to merge two clusters. ClusterB is emptied but never
      // erased, keeping cluster indices stable.
      for (BinaryBasicBlock *BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited edges
        // from cluster B to cluster A.
        ClusterEdges[I][I] += E.Count;
        ClusterEdges[I][I] += ClusterEdges[J][I];
        // Iterate through all inter-cluster edges and transfer edges targeting
        // cluster B to cluster A.
        for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
          ClusterEdges[K][I] += ClusterEdges[K][J];
      }
      // Adjust the weights of the remaining edges and re-sort the queue
      // (subclass hook; a no-op for PHGreedyClusterAlgorithm).
      adjustQueue(Queue, BF);
      LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
    } else {
      // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
      // merge them. Add the count of this edge to the inter-cluster edge count
      // between clusters A and B to help us decide ordering between these
      // clusters.
      if (ComputeEdges)
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(
          dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
    }
  }
}
232 
233 void GreedyClusterAlgorithm::reset() {
234   ClusterAlgorithm::reset();
235   BBToClusterMap.clear();
236 }
237 
238 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
239                                          const BinaryFunction &BF) {
240   // Define a comparison function to establish SWO between edges.
241   auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
242     // With equal weights, prioritize branches with lower index
243     // source/destination. This helps to keep original block order for blocks
244     // when optimal order cannot be deducted from a profile.
245     if (A.Count == B.Count) {
246       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
247       return (SrcOrder != 0)
248                  ? SrcOrder > 0
249                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
250     }
251     return A.Count < B.Count;
252   };
253 
254   // Sort edges in increasing profile count order.
255   std::sort(Queue.begin(), Queue.end(), Comp);
256 }
257 
258 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
259                                            const BinaryFunction &BF) {
260   // Nothing to do.
261   return;
262 }
263 
264 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
265                                                      const ClusterTy &Back,
266                                                      const EdgeTy &E) const {
267   return Front.back() == E.Src && Back.front() == E.Dst;
268 }
269 
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value: the raw execution count of this edge. Competing
  // edges sharing SrcBB or DstBB are subtracted below, so the result measures
  // how strongly this edge dominates its CFG neighborhood (it may go
  // negative).
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  // BI advances in lock-step with the successor list.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with same source and destination, edges that target the
    // entry block as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    // Locate the branch info of the PredBB->DstBB edge by scanning PredBB's
    // successor list until DstBB is found.
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}
315 
316 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
317                                                 const BinaryFunction &BF) {
318   // Initialize edge weights.
319   for (const EdgeTy &E : Queue)
320     Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
321 
322   // Sort edges in increasing weight order.
323   adjustQueue(Queue, BF);
324 }
325 
void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                  const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges. Primary key
  // is the cached weight; ties fall back to the original layout order of
  // sources, then destinations, to keep the input block order stable.
  auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster, dropping them from the queue.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
    // this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

    // Case 2: They are already allocated at the same cluster or incompatible
    // clusters. Adjust the weights of edges with the same source or
    // destination, so that this edge has no effect on them any more, and ignore
    // this edge. Also increase the intra- (or inter-) cluster edge count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (src, dst belong to same cluster or incompatible "
                           "clusters)\n");
      // calculateWeight() subtracted this edge's count from every competing
      // edge; now that this edge can never be merged, add the count back.
      for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order (consumers pop from the
  // back, so the heaviest edge is processed first).
  Queue.swap(NewQueue);
  std::sort(Queue.begin(), Queue.end(), Comp);
}
398 
399 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
400     const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
401   return Front.back() == E.Src && Back.front() == E.Dst;
402 }
403 
404 void MinBranchGreedyClusterAlgorithm::reset() {
405   GreedyClusterAlgorithm::reset();
406   Weight.clear();
407 }
408 
409 void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
410                                              BasicBlockOrder &Order) const {
411   std::vector<std::vector<uint64_t>> Weight;
412   std::vector<BinaryBasicBlock *> IndexToBB;
413 
414   const size_t N = BF.layout_size();
415   assert(N <= std::numeric_limits<uint64_t>::digits &&
416          "cannot use TSP solution for sizes larger than bits in uint64_t");
417 
418   // Populating weight map and index map
419   for (BinaryBasicBlock *BB : BF.layout()) {
420     BB->setLayoutIndex(IndexToBB.size());
421     IndexToBB.push_back(BB);
422   }
423   Weight.resize(N);
424   for (BinaryBasicBlock *BB : BF.layout()) {
425     auto BI = BB->branch_info_begin();
426     Weight[BB->getLayoutIndex()].resize(N);
427     for (BinaryBasicBlock *SuccBB : BB->successors()) {
428       if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
429         Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
430       ++BI;
431     }
432   }
433 
434   std::vector<std::vector<int64_t>> DP;
435   DP.resize(1 << N);
436   for (std::vector<int64_t> &Elmt : DP)
437     Elmt.resize(N, -1);
438 
439   // Start with the entry basic block being allocated with cost zero
440   DP[1][0] = 0;
441   // Walk through TSP solutions using a bitmask to represent state (current set
442   // of BBs in the layout)
443   uint64_t BestSet = 1;
444   uint64_t BestLast = 0;
445   int64_t BestWeight = 0;
446   for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
447     // Traverse each possibility of Last BB visited in this layout
448     for (uint64_t Last = 0; Last < N; ++Last) {
449       // Case 1: There is no possible layout with this BB as Last
450       if (DP[Set][Last] == -1)
451         continue;
452 
453       // Case 2: There is a layout with this Set and this Last, and we try
454       // to expand this set with New
455       for (uint64_t New = 1; New < N; ++New) {
456         // Case 2a: BB "New" is already in this Set
457         if ((Set & (1ULL << New)) != 0)
458           continue;
459 
460         // Case 2b: BB "New" is not in this set and we add it to this Set and
461         // record total weight of this layout with "New" as the last BB.
462         uint64_t NewSet = (Set | (1ULL << New));
463         if (DP[NewSet][New] == -1)
464           DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
465         DP[NewSet][New] = std::max(DP[NewSet][New],
466                                    DP[Set][Last] + (int64_t)Weight[Last][New]);
467 
468         if (DP[NewSet][New] > BestWeight) {
469           BestWeight = DP[NewSet][New];
470           BestSet = NewSet;
471           BestLast = New;
472         }
473       }
474     }
475   }
476 
477   // Define final function layout based on layout that maximizes weight
478   uint64_t Last = BestLast;
479   uint64_t Set = BestSet;
480   std::vector<bool> Visited;
481   Visited.resize(N);
482   Visited[Last] = true;
483   Order.push_back(IndexToBB[Last]);
484   Set = Set & ~(1ULL << Last);
485   while (Set != 0) {
486     int64_t Best = -1;
487     uint64_t NewLast;
488     for (uint64_t I = 0; I < N; ++I) {
489       if (DP[Set][I] == -1)
490         continue;
491       int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
492       if (DP[Set][I] + AdjWeight > Best) {
493         NewLast = I;
494         Best = DP[Set][I] + AdjWeight;
495       }
496     }
497     Last = NewLast;
498     Visited[Last] = true;
499     Order.push_back(IndexToBB[Last]);
500     Set = Set & ~(1ULL << Last);
501   }
502   std::reverse(Order.begin(), Order.end());
503 
504   // Finalize layout with BBs that weren't assigned to the layout using the
505   // input layout.
506   for (BinaryBasicBlock *BB : BF.layout())
507     if (Visited[BB->getLayoutIndex()] == false)
508       Order.push_back(BB);
509 }
510 
511 void OptimizeReorderAlgorithm::reorderBasicBlocks(
512     const BinaryFunction &BF, BasicBlockOrder &Order) const {
513   if (BF.layout_empty())
514     return;
515 
516   // Cluster basic blocks.
517   CAlgo->clusterBasicBlocks(BF);
518 
519   if (opts::PrintClusters)
520     CAlgo->printClusters();
521 
522   // Arrange basic blocks according to clusters.
523   for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
524     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
525 }
526 
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks and record inter-cluster edge counts.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
      CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal. Iterative DFS: a node is pushed STACKED, expanded
  // once (VISITED), and emitted in post-order when popped the second time.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1;
  constexpr uint32_t VISITED = 2;
  Status[0] = STACKED;
  Stack.push(0);
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight so the hottest successor ends up on top
      // of the DFS stack (popped last from SuccQueue, pushed last to Stack).
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)>
          SuccQueue(ClusterComp);
      for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
        // Skip zero-count edges, already stacked nodes, and clusters emptied
        // by earlier merges.
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  // Post-order reversed gives a topological order rooted at cluster 0.
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence: keep an ancestor before its descendant,
  // otherwise order by decreasing average frequency. The ancestor walks
  // assume Parent chains terminate at cluster 0.
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point
  ++Beg;
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
631 
void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Absolute cold cutoff derived from the entry block's execution count;
  // opts::ColdThreshold is expressed in tenths of a percent. NOTE: this local
  // deliberately shadows the command-line option of the same name.
  const uint64_t ColdThreshold =
      opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Order clusters based on average instruction execution frequency. Empty
  // clusters (left behind by merges) are skipped entirely.
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!Clusters[I].empty())
      ClusterOrder.push_back(I);
  // Don't reorder the first cluster, which contains the function entry point
  std::stable_sort(
      std::next(ClusterOrder.begin()), ClusterOrder.end(),
      [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
    // Force zero execution count on clusters that do not meet the cut off
    // specified by --cold-threshold.
    if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
      for (BinaryBasicBlock *BBPtr : Cluster)
        BBPtr->setExecutionCount(0);
  }
}
684 
685 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
686                                                  BasicBlockOrder &Order) const {
687   if (BF.layout_empty())
688     return;
689 
690   BinaryBasicBlock *FirstBB = *BF.layout_begin();
691   Order.push_back(FirstBB);
692   for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
693     Order.push_back(*RLI);
694 }
695 
696 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
697     const BinaryFunction &BF, BasicBlockOrder &Order) const {
698   if (BF.layout_empty())
699     return;
700 
701   // Cluster basic blocks.
702   CAlgo->clusterBasicBlocks(BF);
703   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
704 
705   if (opts::PrintClusters)
706     CAlgo->printClusters();
707 
708   // Cluster layout order
709   std::vector<uint32_t> ClusterOrder;
710 
711   // Order clusters based on average instruction execution frequency
712   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
713     if (!Clusters[I].empty())
714       ClusterOrder.push_back(I);
715 
716   std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
717                std::default_random_engine(opts::RandomSeed.getValue()));
718 
719   if (opts::PrintClusters) {
720     errs() << "New cluster order: ";
721     const char *Sep = "";
722     for (uint32_t O : ClusterOrder) {
723       errs() << Sep << O;
724       Sep = ", ";
725     }
726     errs() << '\n';
727   }
728 
729   // Arrange basic blocks according to cluster order.
730   for (uint32_t ClusterIndex : ClusterOrder) {
731     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
732     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
733   }
734 }
735