xref: /llvm-project/bolt/lib/Passes/ReorderAlgorithm.cpp (revision b92436efcb7813fc481b30f2593a4907568d917a)
1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements classes used by several basic block reordering
10 // algorithms.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "bolt/Passes/ReorderAlgorithm.h"
15 #include "bolt/Core/BinaryBasicBlock.h"
16 #include "bolt/Core/BinaryFunction.h"
17 #include "llvm/Support/CommandLine.h"
18 #include <queue>
19 #include <random>
20 #include <stack>
21 
22 #undef  DEBUG_TYPE
23 #define DEBUG_TYPE "bolt"
24 
25 using namespace llvm;
26 using namespace bolt;
27 
namespace opts {

extern cl::OptionCategory BoltOptCategory;
extern cl::opt<bool> NoThreads;

// Frequency cut-off, in tenths of a percent of the entry block's execution
// count, below which OptimizeCacheReorderAlgorithm forces a cluster's blocks
// to an execution count of zero. 0 means only zero-sample blocks are cold.
static cl::opt<unsigned> ColdThreshold(
    "cold-threshold",
    cl::desc("tenths of percents of main entry frequency to use as a "
             "threshold when evaluating whether a basic block is cold "
             "(0 means it is only considered cold if the block has zero "
             "samples). Default: 0 "),
    cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));

// When set, dump each cluster's contents (and the final cluster order) to
// stderr while reordering.
static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
                                   cl::Hidden, cl::cat(BoltOptCategory));

// Seed for RandomClusterReorderAlgorithm's shuffle. Non-static: referenced
// from other translation units.
cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
                             cl::init(42), cl::Hidden,
                             cl::cat(BoltOptCategory));

} // namespace opts
49 
namespace {

/// Mix \p Val into the running hash \p Seed using boost::hash_combine-style
/// mixing (golden-ratio constant plus shifted feedback of the old seed).
template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
  Seed ^= std::hash<T>()(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
}

/// Hash functor for std::pair: hash the first member, then fold in the
/// second via hashCombine.
template <typename A, typename B> struct HashPair {
  size_t operator()(const std::pair<A, B> &Val) const {
    size_t Result = std::hash<A>()(Val.first);
    hashCombine(Result, Val.second);
    return Result;
  }
};

} // namespace
67 
68 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
69   // Create a separate MCCodeEmitter to allow lock-free execution
70   BinaryContext::IndependentCodeEmitter Emitter;
71   if (!opts::NoThreads)
72     Emitter = BC.createIndependentMCCodeEmitter();
73 
74   AvgFreq.resize(Clusters.size(), 0.0);
75   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
76     double Freq = 0.0;
77     uint64_t ClusterSize = 0;
78     for (BinaryBasicBlock *BB : Clusters[I]) {
79       if (BB->getNumNonPseudos() > 0) {
80         Freq += BB->getExecutionCount();
81         // Estimate the size of a block in bytes at run time
82         // NOTE: This might be inaccurate
83         ClusterSize += BB->estimateSize(Emitter.MCE.get());
84       }
85     }
86     AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
87   }
88 }
89 
90 void ClusterAlgorithm::printClusters() const {
91   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
92     errs() << "Cluster number " << I;
93     if (AvgFreq.size() == Clusters.size())
94       errs() << " (frequency: " << AvgFreq[I] << ")";
95     errs() << " : ";
96     const char *Sep = "";
97     for (BinaryBasicBlock *BB : Clusters[I]) {
98       errs() << Sep << BB->getName();
99       Sep = ", ";
100     }
101     errs() << "\n";
102   }
103 }
104 
105 void ClusterAlgorithm::reset() {
106   Clusters.clear();
107   ClusterEdges.clear();
108   AvgFreq.clear();
109 }
110 
111 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
112   OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
113 }
114 
115 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
116   HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
117   return Hasher(std::make_pair(E.Src, E.Dst));
118 }
119 
120 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
121                                                    const EdgeTy &B) const {
122   return A.Src == B.Src && A.Dst == B.Dst;
123 }
124 
/// Greedy heuristic for the TSP applied to basic-block layout: start with a
/// singleton cluster per block and repeatedly merge clusters across the edge
/// popped from the back of the queue, so the hottest branches become
/// fall-throughs. When \p ComputeEdges is true, inter-/intra-cluster edge
/// counts are accumulated in ClusterEdges for later cluster ordering.
/// Queue ordering and weight maintenance are delegated to the initQueue /
/// adjustQueue / areClustersCompatible hooks overridden by subclasses.
void GreedyClusterAlgorithm::clusterBasicBlocks(const BinaryFunction &BF,
                                                bool ComputeEdges) {
  reset();

  // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
  // maximize weight during a path traversing all BBs. In this way, we will
  // convert the hottest branches into fall-throughs.

  // This is the queue of edges from which we will pop edges and use them to
  // cluster basic blocks in a greedy fashion.
  std::vector<EdgeTy> Queue;

  // Initialize inter-cluster weights.
  if (ComputeEdges)
    ClusterEdges.resize(BF.layout_size());

  // Initialize clusters and edge queue.
  for (BinaryBasicBlock *BB : BF.layout()) {
    // Create a cluster for this BB.
    uint32_t I = Clusters.size();
    Clusters.emplace_back();
    std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
    Cluster.push_back(BB);
    BBToClusterMap[BB] = I;
    // Populate priority queue with edges. Every successor edge requires
    // profile data (asserted below).
    auto BI = BB->branch_info_begin();
    for (BinaryBasicBlock *&I : BB->successors()) {
      assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
             "attempted reordering blocks of function with no profile data");
      Queue.emplace_back(EdgeTy(BB, I, BI->Count));
      ++BI;
    }
  }
  // Sort and adjust the edge queue.
  initQueue(Queue, BF);

  // Grow clusters in a greedy fashion.
  while (!Queue.empty()) {
    EdgeTy E = Queue.back();
    Queue.pop_back();

    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");

    // Case 1: BBSrc and BBDst are the same. Ignore this edge
    // (also skips edges into the entry block, which must remain first and
    // therefore can never become a fall-through target).
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];

    // Case 2: If they are already allocated at the same cluster, just increase
    // the weight of this cluster
    if (I == J) {
      if (ComputeEdges)
        ClusterEdges[I][I] += E.Count;
      LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
      continue;
    }

    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
    if (areClustersCompatible(ClusterA, ClusterB, E)) {
      // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
      // allowing us to merge two clusters. Cluster B's blocks are appended to
      // cluster A and B is left empty (indices are never reused).
      for (BinaryBasicBlock *BB : ClusterB)
        BBToClusterMap[BB] = I;
      ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
      ClusterB.clear();
      if (ComputeEdges) {
        // Increase the intra-cluster edge count of cluster A with the count of
        // this edge as well as with the total count of previously visited edges
        // from cluster B to cluster A.
        ClusterEdges[I][I] += E.Count;
        ClusterEdges[I][I] += ClusterEdges[J][I];
        // Iterate through all inter-cluster edges and transfer edges targeting
        // cluster B to cluster A. (Note: this loop's E shadows the popped
        // edge above.)
        for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
          ClusterEdges[K][I] += ClusterEdges[K][J];
      }
      // Adjust the weights of the remaining edges and re-sort the queue.
      adjustQueue(Queue, BF);
      LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
    } else {
      // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
      // merge them. Add the count of this edge to the inter-cluster edge count
      // between clusters A and B to help us decide ordering between these
      // clusters.
      if (ComputeEdges)
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(
          dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
    }
  }
}
224 
225 void GreedyClusterAlgorithm::reset() {
226   ClusterAlgorithm::reset();
227   BBToClusterMap.clear();
228 }
229 
230 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
231                                          const BinaryFunction &BF) {
232   // Define a comparison function to establish SWO between edges.
233   auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
234     // With equal weights, prioritize branches with lower index
235     // source/destination. This helps to keep original block order for blocks
236     // when optimal order cannot be deducted from a profile.
237     if (A.Count == B.Count) {
238       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
239       return (SrcOrder != 0)
240                  ? SrcOrder > 0
241                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
242     }
243     return A.Count < B.Count;
244   };
245 
246   // Sort edges in increasing profile count order.
247   std::sort(Queue.begin(), Queue.end(), Comp);
248 }
249 
250 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
251                                            const BinaryFunction &BF) {
252   // Nothing to do.
253   return;
254 }
255 
256 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
257                                                      const ClusterTy &Back,
258                                                      const EdgeTy &E) const {
259   return Front.back() == E.Src && Back.front() == E.Dst;
260 }
261 
/// Compute the priority weight of edge \p E: the edge's own count minus the
/// counts of all competing edges — other out-edges of E.Src and other
/// in-edges of E.Dst. Self-loops and edges into the entry block do not count
/// as competitors for the source (they can never become fall-throughs). A
/// high weight means making E a fall-through forfeits little count on
/// competing branches; the result may be negative.
int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
    const EdgeTy &E, const BinaryFunction &BF) const {
  const BinaryBasicBlock *SrcBB = E.Src;
  const BinaryBasicBlock *DstBB = E.Dst;

  // Initial weight value.
  int64_t W = (int64_t)E.Count;

  // Adjust the weight by taking into account other edges with the same source.
  // BI walks branch_info in lockstep with the successor list.
  auto BI = SrcBB->branch_info_begin();
  for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    // Ignore edges with same source and destination, edges that target the
    // entry block as well as the edge E itself.
    if (SuccBB != SrcBB && SuccBB != *BF.layout_begin() && SuccBB != DstBB)
      W -= (int64_t)BI->Count;
    ++BI;
  }

  // Adjust the weight by taking into account other edges with the same
  // destination.
  for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
    // Ignore edges with same source and destination as well as the edge E
    // itself.
    if (PredBB == DstBB || PredBB == SrcBB)
      continue;
    // Locate the branch info for the edge PredBB -> DstBB by scanning
    // PredBB's successor list in lockstep with its branch info.
    auto BI = PredBB->branch_info_begin();
    for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
      if (SuccBB == DstBB)
        break;
      ++BI;
    }
    assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
    assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
           "attempted reordering blocks of function with no profile data");
    assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
           "overflow detected");
    W -= (int64_t)BI->Count;
  }

  return W;
}
307 
308 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
309                                                 const BinaryFunction &BF) {
310   // Initialize edge weights.
311   for (const EdgeTy &E : Queue)
312     Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
313 
314   // Sort edges in increasing weight order.
315   adjustQueue(Queue, BF);
316 }
317 
/// Re-filter and re-sort the edge queue after cluster merges. Edges that can
/// no longer trigger a merge are dropped, and their counts are credited back
/// to the weights of sibling edges (same source or same destination) that no
/// longer compete with them. Remaining edges are sorted by increasing weight
/// so the highest-weight edge is popped (from the back) first.
void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
                                                  const BinaryFunction &BF) {
  // Define a comparison function to establish SWO between edges.
  auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
    // With equal weights, prioritize branches with lower index
    // source/destination. This helps to keep original block order for blocks
    // when optimal order cannot be deduced from a profile.
    if (Weight[A] == Weight[B]) {
      const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
      return (SrcOrder != 0)
                 ? SrcOrder > 0
                 : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
    }
    return Weight[A] < Weight[B];
  };

  // Iterate through all remaining edges to find edges that have their
  // source and destination in the same cluster.
  std::vector<EdgeTy> NewQueue;
  for (const EdgeTy &E : Queue) {
    const BinaryBasicBlock *SrcBB = E.Src;
    const BinaryBasicBlock *DstBB = E.Dst;

    // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
    // this edge.
    if (SrcBB == DstBB || DstBB == *BF.layout_begin()) {
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (same src, dst)\n");
      continue;
    }

    int I = BBToClusterMap[SrcBB];
    int J = BBToClusterMap[DstBB];
    std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
    std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];

    // Case 2: They are already allocated at the same cluster or incompatible
    // clusters. Adjust the weights of edges with the same source or
    // destination, so that this edge has no effect on them any more, and ignore
    // this edge. Also increase the intra- (or inter-) cluster edge count.
    if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
      if (!ClusterEdges.empty())
        ClusterEdges[I][J] += E.Count;
      LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
                 dbgs() << " (src, dst belong to same cluster or incompatible "
                           "clusters)\n");
      // E no longer competes with its sibling out-edges of SrcBB, so return
      // its count to their weights. The dummy Count of 0 in the lookup key is
      // fine: EdgeHash/EdgeEqual only consider the (Src, Dst) endpoints.
      for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
        if (SuccBB == DstBB)
          continue;
        auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      // Likewise for sibling in-edges of DstBB.
      for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
        if (PredBB == SrcBB)
          continue;
        auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
        assert(WI != Weight.end() && "CFG edge not found in Weight map");
        WI->second += (int64_t)E.Count;
      }
      continue;
    }

    // Case 3: None of the previous cases is true, so just keep this edge in
    // the queue.
    NewQueue.emplace_back(E);
  }

  // Sort remaining edges in increasing weight order.
  Queue.swap(NewQueue);
  std::sort(Queue.begin(), Queue.end(), Comp);
}
390 
391 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
392     const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
393   return Front.back() == E.Src && Back.front() == E.Dst;
394 }
395 
396 void MinBranchGreedyClusterAlgorithm::reset() {
397   GreedyClusterAlgorithm::reset();
398   Weight.clear();
399 }
400 
/// Solve block layout as a travelling-salesman problem with a Held-Karp style
/// bitmask DP: DP[Set][Last] is the maximum fall-through weight of a path
/// that visits exactly the blocks in bitmask Set and ends at block Last. The
/// best path is reconstructed backwards into \p Order; blocks the best path
/// never reached are appended in their input layout order.
/// NOTE(review): cost is O(2^N * N^2) time and O(2^N * N) memory, and
/// `1 << N` below is an int shift — although the assert alone permits N up
/// to 64, callers are expected to invoke this only for small N; confirm.
void TSPReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
                                             BasicBlockOrder &Order) const {
  std::vector<std::vector<uint64_t>> Weight;
  std::vector<BinaryBasicBlock *> IndexToBB;

  const size_t N = BF.layout_size();
  assert(N <= std::numeric_limits<uint64_t>::digits &&
         "cannot use TSP solution for sizes larger than bits in uint64_t");

  // Populating weight map and index map
  for (BinaryBasicBlock *BB : BF.layout()) {
    BB->setLayoutIndex(IndexToBB.size());
    IndexToBB.push_back(BB);
  }
  // Weight[i][j] is the profiled count of the CFG edge i -> j (0 if absent).
  Weight.resize(N);
  for (BinaryBasicBlock *BB : BF.layout()) {
    auto BI = BB->branch_info_begin();
    Weight[BB->getLayoutIndex()].resize(N);
    for (BinaryBasicBlock *SuccBB : BB->successors()) {
      if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
        Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
      ++BI;
    }
  }

  // DP value -1 marks an unreachable (Set, Last) state.
  std::vector<std::vector<int64_t>> DP;
  DP.resize(1 << N);
  for (std::vector<int64_t> &Elmt : DP)
    Elmt.resize(N, -1);

  // Start with the entry basic block being allocated with cost zero
  DP[1][0] = 0;
  // Walk through TSP solutions using a bitmask to represent state (current set
  // of BBs in the layout)
  uint64_t BestSet = 1;
  uint64_t BestLast = 0;
  int64_t BestWeight = 0;
  for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
    // Traverse each possibility of Last BB visited in this layout
    for (uint64_t Last = 0; Last < N; ++Last) {
      // Case 1: There is no possible layout with this BB as Last
      if (DP[Set][Last] == -1)
        continue;

      // Case 2: There is a layout with this Set and this Last, and we try
      // to expand this set with New
      for (uint64_t New = 1; New < N; ++New) {
        // Case 2a: BB "New" is already in this Set
        if ((Set & (1ULL << New)) != 0)
          continue;

        // Case 2b: BB "New" is not in this set and we add it to this Set and
        // record total weight of this layout with "New" as the last BB.
        uint64_t NewSet = (Set | (1ULL << New));
        // Counts are non-negative, so the std::max below already subsumes the
        // -1 (unreachable) case; the preceding if is a redundant guard.
        if (DP[NewSet][New] == -1)
          DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
        DP[NewSet][New] = std::max(DP[NewSet][New],
                                   DP[Set][Last] + (int64_t)Weight[Last][New]);

        if (DP[NewSet][New] > BestWeight) {
          BestWeight = DP[NewSet][New];
          BestSet = NewSet;
          BestLast = New;
        }
      }
    }
  }

  // Define final function layout based on layout that maximizes weight
  uint64_t Last = BestLast;
  uint64_t Set = BestSet;
  BitVector Visited;
  Visited.resize(N);
  Visited[Last] = true;
  Order.push_back(IndexToBB[Last]);
  Set = Set & ~(1ULL << Last);
  // Walk the best path backwards: at each step pick the predecessor state
  // with the best DP value plus the weight of the edge into Last.
  while (Set != 0) {
    int64_t Best = -1;
    // NOTE(review): NewLast is only assigned when some DP[Set][I] != -1;
    // appears guaranteed because Set was produced by the DP above — confirm.
    uint64_t NewLast;
    for (uint64_t I = 0; I < N; ++I) {
      if (DP[Set][I] == -1)
        continue;
      int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
      if (DP[Set][I] + AdjWeight > Best) {
        NewLast = I;
        Best = DP[Set][I] + AdjWeight;
      }
    }
    Last = NewLast;
    Visited[Last] = true;
    Order.push_back(IndexToBB[Last]);
    Set = Set & ~(1ULL << Last);
  }
  std::reverse(Order.begin(), Order.end());

  // Finalize layout with BBs that weren't assigned to the layout using the
  // input layout.
  for (BinaryBasicBlock *BB : BF.layout())
    if (Visited[BB->getLayoutIndex()] == false)
      Order.push_back(BB);
}
502 
503 void OptimizeReorderAlgorithm::reorderBasicBlocks(
504     const BinaryFunction &BF, BasicBlockOrder &Order) const {
505   if (BF.layout_empty())
506     return;
507 
508   // Cluster basic blocks.
509   CAlgo->clusterBasicBlocks(BF);
510 
511   if (opts::PrintClusters)
512     CAlgo->printClusters();
513 
514   // Arrange basic blocks according to clusters.
515   for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
516     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
517 }
518 
/// Order clusters so heavy inter-cluster edges tend to become forward
/// branches: run an iterative DFS over the cluster graph from the entry
/// cluster (expanding heavier successors first), take the reverse post-order
/// as the initial cluster order, append unreachable clusters, and finally
/// stable-sort equal-precedence clusters by average frequency while keeping
/// DFS descendants after their ancestors.
void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
    const BinaryFunction &BF, BasicBlockOrder &Order) const {
  if (BF.layout_empty())
    return;

  // Cluster basic blocks.
  CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
  std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
  std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
      CAlgo->ClusterEdges;

  // Compute clusters' average frequencies.
  CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
  std::vector<double> &AvgFreq = CAlgo->AvgFreq;

  if (opts::PrintClusters)
    CAlgo->printClusters();

  // Cluster layout order
  std::vector<uint32_t> ClusterOrder;

  // Do a topological sort for clusters, prioritizing frequently-executed BBs
  // during the traversal.
  std::stack<uint32_t> Stack;
  std::vector<uint32_t> Status;
  std::vector<uint32_t> Parent;
  Status.resize(Clusters.size(), 0);
  Parent.resize(Clusters.size(), 0);
  constexpr uint32_t STACKED = 1; // pushed on the DFS stack at some point
  constexpr uint32_t VISITED = 2; // successors already expanded
  Status[0] = STACKED;
  Stack.push(0);
  // Each node is seen twice: first to expand its successors, then (once all
  // of them are done) to be emitted in post-order.
  while (!Stack.empty()) {
    uint32_t I = Stack.top();
    if (!(Status[I] & VISITED)) {
      Status[I] |= VISITED;
      // Order successors by weight. The priority queue pops the
      // lightest-edge successor first, so heavier successors end up nearer
      // the top of Stack and are traversed first.
      auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
        return ClusterEdges[I][A] > ClusterEdges[I][B];
      };
      std::priority_queue<uint32_t, std::vector<uint32_t>,
                          decltype(ClusterComp)>
          SuccQueue(ClusterComp);
      for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
        if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
            !Clusters[Target.first].empty()) {
          Parent[Target.first] = I;
          Status[Target.first] = STACKED;
          SuccQueue.push(Target.first);
        }
      }
      while (!SuccQueue.empty()) {
        Stack.push(SuccQueue.top());
        SuccQueue.pop();
      }
      continue;
    }
    // Already visited this node
    Stack.pop();
    ClusterOrder.push_back(I);
  }
  std::reverse(ClusterOrder.begin(), ClusterOrder.end());
  // Put unreachable clusters at the end
  for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
    if (!(Status[I] & VISITED) && !Clusters[I].empty())
      ClusterOrder.push_back(I);

  // Sort nodes with equal precedence
  auto Beg = ClusterOrder.begin();
  // Don't reorder the first cluster, which contains the function entry point
  ++Beg;
  // NOTE(review): this comparator mixes DFS-ancestry checks with a frequency
  // fallback; it is not obviously a strict weak ordering, which
  // std::stable_sort formally requires — confirm.
  std::stable_sort(Beg, ClusterOrder.end(),
                   [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
                     // Keep a cluster after any of its DFS ancestors.
                     uint32_t P = Parent[A];
                     while (Parent[P] != 0) {
                       if (Parent[P] == B)
                         return false;
                       P = Parent[P];
                     }
                     P = Parent[B];
                     while (Parent[P] != 0) {
                       if (Parent[P] == A)
                         return true;
                       P = Parent[P];
                     }
                     // Unrelated clusters: hotter (by average frequency) first.
                     return AvgFreq[A] > AvgFreq[B];
                   });

  if (opts::PrintClusters) {
    errs() << "New cluster order: ";
    const char *Sep = "";
    for (uint32_t O : ClusterOrder) {
      errs() << Sep << O;
      Sep = ", ";
    }
    errs() << '\n';
  }

  // Arrange basic blocks according to cluster order.
  for (uint32_t ClusterIndex : ClusterOrder) {
    ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
    Order.insert(Order.end(), Cluster.begin(), Cluster.end());
  }
}
623 
624 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
625     const BinaryFunction &BF, BasicBlockOrder &Order) const {
626   if (BF.layout_empty())
627     return;
628 
629   const uint64_t ColdThreshold =
630       opts::ColdThreshold * (*BF.layout_begin())->getExecutionCount() / 1000;
631 
632   // Cluster basic blocks.
633   CAlgo->clusterBasicBlocks(BF);
634   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
635 
636   // Compute clusters' average frequencies.
637   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
638   std::vector<double> &AvgFreq = CAlgo->AvgFreq;
639 
640   if (opts::PrintClusters)
641     CAlgo->printClusters();
642 
643   // Cluster layout order
644   std::vector<uint32_t> ClusterOrder;
645 
646   // Order clusters based on average instruction execution frequency
647   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
648     if (!Clusters[I].empty())
649       ClusterOrder.push_back(I);
650   // Don't reorder the first cluster, which contains the function entry point
651   std::stable_sort(
652       std::next(ClusterOrder.begin()), ClusterOrder.end(),
653       [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });
654 
655   if (opts::PrintClusters) {
656     errs() << "New cluster order: ";
657     const char *Sep = "";
658     for (uint32_t O : ClusterOrder) {
659       errs() << Sep << O;
660       Sep = ", ";
661     }
662     errs() << '\n';
663   }
664 
665   // Arrange basic blocks according to cluster order.
666   for (uint32_t ClusterIndex : ClusterOrder) {
667     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
668     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
669     // Force zero execution count on clusters that do not meet the cut off
670     // specified by --cold-threshold.
671     if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
672       for (BinaryBasicBlock *BBPtr : Cluster)
673         BBPtr->setExecutionCount(0);
674   }
675 }
676 
677 void ReverseReorderAlgorithm::reorderBasicBlocks(const BinaryFunction &BF,
678                                                  BasicBlockOrder &Order) const {
679   if (BF.layout_empty())
680     return;
681 
682   BinaryBasicBlock *FirstBB = *BF.layout_begin();
683   Order.push_back(FirstBB);
684   for (auto RLI = BF.layout_rbegin(); *RLI != FirstBB; ++RLI)
685     Order.push_back(*RLI);
686 }
687 
688 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
689     const BinaryFunction &BF, BasicBlockOrder &Order) const {
690   if (BF.layout_empty())
691     return;
692 
693   // Cluster basic blocks.
694   CAlgo->clusterBasicBlocks(BF);
695   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
696 
697   if (opts::PrintClusters)
698     CAlgo->printClusters();
699 
700   // Cluster layout order
701   std::vector<uint32_t> ClusterOrder;
702 
703   // Order clusters based on average instruction execution frequency
704   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
705     if (!Clusters[I].empty())
706       ClusterOrder.push_back(I);
707 
708   std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
709                std::default_random_engine(opts::RandomSeed.getValue()));
710 
711   if (opts::PrintClusters) {
712     errs() << "New cluster order: ";
713     const char *Sep = "";
714     for (uint32_t O : ClusterOrder) {
715       errs() << Sep << O;
716       Sep = ", ";
717     }
718     errs() << '\n';
719   }
720 
721   // Arrange basic blocks according to cluster order.
722   for (uint32_t ClusterIndex : ClusterOrder) {
723     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
724     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
725   }
726 }
727