xref: /llvm-project/bolt/lib/Passes/ReorderAlgorithm.cpp (revision fa5486e487a6f98358eca572efb4b1fd7d27d1f5)
1 //===- bolt/Passes/ReorderAlgorithm.cpp - Basic block reordering ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements classes used by several basic block reordering
10 // algorithms.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "bolt/Passes/ReorderAlgorithm.h"
15 #include "bolt/Core/BinaryBasicBlock.h"
16 #include "bolt/Core/BinaryFunction.h"
17 #include "llvm/Support/CommandLine.h"
18 #include "llvm/Transforms/Utils/CodeLayout.h"
19 #include <queue>
20 #include <random>
21 #include <stack>
22 
23 #undef DEBUG_TYPE
24 #define DEBUG_TYPE "bolt"
25 
26 using namespace llvm;
27 using namespace bolt;
28 
29 namespace opts {
30 
31 extern cl::OptionCategory BoltOptCategory;
32 extern cl::opt<bool> NoThreads;
33 
34 static cl::opt<unsigned> ColdThreshold(
35     "cold-threshold",
36     cl::desc("tenths of percents of main entry frequency to use as a "
37              "threshold when evaluating whether a basic block is cold "
38              "(0 means it is only considered cold if the block has zero "
39              "samples). Default: 0 "),
40     cl::init(0), cl::ZeroOrMore, cl::Hidden, cl::cat(BoltOptCategory));
41 
42 static cl::opt<bool> PrintClusters("print-clusters", cl::desc("print clusters"),
43                                    cl::Hidden, cl::cat(BoltOptCategory));
44 
45 cl::opt<uint32_t> RandomSeed("bolt-seed", cl::desc("seed for randomization"),
46                              cl::init(42), cl::Hidden,
47                              cl::cat(BoltOptCategory));
48 
49 } // namespace opts
50 
51 namespace {
52 
hashCombine(size_t & Seed,const T & Val)53 template <class T> inline void hashCombine(size_t &Seed, const T &Val) {
54   std::hash<T> Hasher;
55   Seed ^= Hasher(Val) + 0x9e3779b9 + (Seed << 6) + (Seed >> 2);
56 }
57 
58 template <typename A, typename B> struct HashPair {
operator ()__anon9cbc67f10111::HashPair59   size_t operator()(const std::pair<A, B> &Val) const {
60     std::hash<A> Hasher;
61     size_t Seed = Hasher(Val.first);
62     hashCombine(Seed, Val.second);
63     return Seed;
64   }
65 };
66 
67 } // namespace
68 
computeClusterAverageFrequency(const BinaryContext & BC)69 void ClusterAlgorithm::computeClusterAverageFrequency(const BinaryContext &BC) {
70   // Create a separate MCCodeEmitter to allow lock-free execution
71   BinaryContext::IndependentCodeEmitter Emitter;
72   if (!opts::NoThreads)
73     Emitter = BC.createIndependentMCCodeEmitter();
74 
75   AvgFreq.resize(Clusters.size(), 0.0);
76   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
77     double Freq = 0.0;
78     uint64_t ClusterSize = 0;
79     for (const BinaryBasicBlock *BB : Clusters[I]) {
80       if (BB->getNumNonPseudos() > 0) {
81         Freq += BB->getExecutionCount();
82         // Estimate the size of a block in bytes at run time
83         // NOTE: This might be inaccurate
84         ClusterSize += BB->estimateSize(Emitter.MCE.get());
85       }
86     }
87     AvgFreq[I] = ClusterSize == 0 ? 0 : Freq / ClusterSize;
88   }
89 }
90 
printClusters() const91 void ClusterAlgorithm::printClusters() const {
92   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I) {
93     errs() << "Cluster number " << I;
94     if (AvgFreq.size() == Clusters.size())
95       errs() << " (frequency: " << AvgFreq[I] << ")";
96     errs() << " : ";
97     const char *Sep = "";
98     for (const BinaryBasicBlock *BB : Clusters[I]) {
99       errs() << Sep << BB->getName();
100       Sep = ", ";
101     }
102     errs() << "\n";
103   }
104 }
105 
reset()106 void ClusterAlgorithm::reset() {
107   Clusters.clear();
108   ClusterEdges.clear();
109   AvgFreq.clear();
110 }
111 
print(raw_ostream & OS) const112 void GreedyClusterAlgorithm::EdgeTy::print(raw_ostream &OS) const {
113   OS << Src->getName() << " -> " << Dst->getName() << ", count: " << Count;
114 }
115 
operator ()(const EdgeTy & E) const116 size_t GreedyClusterAlgorithm::EdgeHash::operator()(const EdgeTy &E) const {
117   HashPair<const BinaryBasicBlock *, const BinaryBasicBlock *> Hasher;
118   return Hasher(std::make_pair(E.Src, E.Dst));
119 }
120 
operator ()(const EdgeTy & A,const EdgeTy & B) const121 bool GreedyClusterAlgorithm::EdgeEqual::operator()(const EdgeTy &A,
122                                                    const EdgeTy &B) const {
123   return A.Src == B.Src && A.Dst == B.Dst;
124 }
125 
clusterBasicBlocks(BinaryFunction & BF,bool ComputeEdges)126 void GreedyClusterAlgorithm::clusterBasicBlocks(BinaryFunction &BF,
127                                                 bool ComputeEdges) {
128   reset();
129 
130   // Greedy heuristic implementation for the TSP, applied to BB layout. Try to
131   // maximize weight during a path traversing all BBs. In this way, we will
132   // convert the hottest branches into fall-throughs.
133 
134   // This is the queue of edges from which we will pop edges and use them to
135   // cluster basic blocks in a greedy fashion.
136   std::vector<EdgeTy> Queue;
137 
138   // Initialize inter-cluster weights.
139   if (ComputeEdges)
140     ClusterEdges.resize(BF.getLayout().block_size());
141 
142   // Initialize clusters and edge queue.
143   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
144     // Create a cluster for this BB.
145     uint32_t I = Clusters.size();
146     Clusters.emplace_back();
147     std::vector<BinaryBasicBlock *> &Cluster = Clusters.back();
148     Cluster.push_back(BB);
149     BBToClusterMap[BB] = I;
150     // Populate priority queue with edges.
151     auto BI = BB->branch_info_begin();
152     for (const BinaryBasicBlock *I : BB->successors()) {
153       assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
154              "attempted reordering blocks of function with no profile data");
155       Queue.emplace_back(EdgeTy(BB, I, BI->Count));
156       ++BI;
157     }
158   }
159   // Sort and adjust the edge queue.
160   initQueue(Queue, BF);
161 
162   // Grow clusters in a greedy fashion.
163   while (!Queue.empty()) {
164     EdgeTy E = Queue.back();
165     Queue.pop_back();
166 
167     const BinaryBasicBlock *SrcBB = E.Src;
168     const BinaryBasicBlock *DstBB = E.Dst;
169 
170     LLVM_DEBUG(dbgs() << "Popped edge "; E.print(dbgs()); dbgs() << "\n");
171 
172     // Case 1: BBSrc and BBDst are the same. Ignore this edge
173     if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
174       LLVM_DEBUG(dbgs() << "\tIgnored (same src, dst)\n");
175       continue;
176     }
177 
178     int I = BBToClusterMap[SrcBB];
179     int J = BBToClusterMap[DstBB];
180 
181     // Case 2: If they are already allocated at the same cluster, just increase
182     // the weight of this cluster
183     if (I == J) {
184       if (ComputeEdges)
185         ClusterEdges[I][I] += E.Count;
186       LLVM_DEBUG(dbgs() << "\tIgnored (src, dst belong to the same cluster)\n");
187       continue;
188     }
189 
190     std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
191     std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
192     if (areClustersCompatible(ClusterA, ClusterB, E)) {
193       // Case 3: SrcBB is at the end of a cluster and DstBB is at the start,
194       // allowing us to merge two clusters.
195       for (const BinaryBasicBlock *BB : ClusterB)
196         BBToClusterMap[BB] = I;
197       ClusterA.insert(ClusterA.end(), ClusterB.begin(), ClusterB.end());
198       ClusterB.clear();
199       if (ComputeEdges) {
200         // Increase the intra-cluster edge count of cluster A with the count of
201         // this edge as well as with the total count of previously visited edges
202         // from cluster B cluster A.
203         ClusterEdges[I][I] += E.Count;
204         ClusterEdges[I][I] += ClusterEdges[J][I];
205         // Iterate through all inter-cluster edges and transfer edges targeting
206         // cluster B to cluster A.
207         for (uint32_t K = 0, E = ClusterEdges.size(); K != E; ++K)
208           ClusterEdges[K][I] += ClusterEdges[K][J];
209       }
210       // Adjust the weights of the remaining edges and re-sort the queue.
211       adjustQueue(Queue, BF);
212       LLVM_DEBUG(dbgs() << "\tMerged clusters of src, dst\n");
213     } else {
214       // Case 4: Both SrcBB and DstBB are allocated in positions we cannot
215       // merge them. Add the count of this edge to the inter-cluster edge count
216       // between clusters A and B to help us decide ordering between these
217       // clusters.
218       if (ComputeEdges)
219         ClusterEdges[I][J] += E.Count;
220       LLVM_DEBUG(
221           dbgs() << "\tIgnored (src, dst belong to incompatible clusters)\n");
222     }
223   }
224 }
225 
reset()226 void GreedyClusterAlgorithm::reset() {
227   ClusterAlgorithm::reset();
228   BBToClusterMap.clear();
229 }
230 
initQueue(std::vector<EdgeTy> & Queue,const BinaryFunction & BF)231 void PHGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
232                                          const BinaryFunction &BF) {
233   // Define a comparison function to establish SWO between edges.
234   auto Comp = [&BF](const EdgeTy &A, const EdgeTy &B) {
235     // With equal weights, prioritize branches with lower index
236     // source/destination. This helps to keep original block order for blocks
237     // when optimal order cannot be deducted from a profile.
238     if (A.Count == B.Count) {
239       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
240       return (SrcOrder != 0)
241                  ? SrcOrder > 0
242                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
243     }
244     return A.Count < B.Count;
245   };
246 
247   // Sort edges in increasing profile count order.
248   llvm::sort(Queue, Comp);
249 }
250 
adjustQueue(std::vector<EdgeTy> & Queue,const BinaryFunction & BF)251 void PHGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
252                                            const BinaryFunction &BF) {
253   // Nothing to do.
254 }
255 
areClustersCompatible(const ClusterTy & Front,const ClusterTy & Back,const EdgeTy & E) const256 bool PHGreedyClusterAlgorithm::areClustersCompatible(const ClusterTy &Front,
257                                                      const ClusterTy &Back,
258                                                      const EdgeTy &E) const {
259   return Front.back() == E.Src && Back.front() == E.Dst;
260 }
261 
calculateWeight(const EdgeTy & E,const BinaryFunction & BF) const262 int64_t MinBranchGreedyClusterAlgorithm::calculateWeight(
263     const EdgeTy &E, const BinaryFunction &BF) const {
264   const BinaryBasicBlock *SrcBB = E.Src;
265   const BinaryBasicBlock *DstBB = E.Dst;
266 
267   // Initial weight value.
268   int64_t W = (int64_t)E.Count;
269 
270   // Adjust the weight by taking into account other edges with the same source.
271   auto BI = SrcBB->branch_info_begin();
272   for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
273     assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
274            "attempted reordering blocks of function with no profile data");
275     assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
276            "overflow detected");
277     // Ignore edges with same source and destination, edges that target the
278     // entry block as well as the edge E itself.
279     if (SuccBB != SrcBB && SuccBB != *BF.getLayout().block_begin() &&
280         SuccBB != DstBB)
281       W -= (int64_t)BI->Count;
282     ++BI;
283   }
284 
285   // Adjust the weight by taking into account other edges with the same
286   // destination.
287   for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
288     // Ignore edges with same source and destination as well as the edge E
289     // itself.
290     if (PredBB == DstBB || PredBB == SrcBB)
291       continue;
292     auto BI = PredBB->branch_info_begin();
293     for (const BinaryBasicBlock *SuccBB : PredBB->successors()) {
294       if (SuccBB == DstBB)
295         break;
296       ++BI;
297     }
298     assert(BI != PredBB->branch_info_end() && "invalid control flow graph");
299     assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
300            "attempted reordering blocks of function with no profile data");
301     assert(BI->Count <= std::numeric_limits<int64_t>::max() &&
302            "overflow detected");
303     W -= (int64_t)BI->Count;
304   }
305 
306   return W;
307 }
308 
initQueue(std::vector<EdgeTy> & Queue,const BinaryFunction & BF)309 void MinBranchGreedyClusterAlgorithm::initQueue(std::vector<EdgeTy> &Queue,
310                                                 const BinaryFunction &BF) {
311   // Initialize edge weights.
312   for (const EdgeTy &E : Queue)
313     Weight.emplace(std::make_pair(E, calculateWeight(E, BF)));
314 
315   // Sort edges in increasing weight order.
316   adjustQueue(Queue, BF);
317 }
318 
adjustQueue(std::vector<EdgeTy> & Queue,const BinaryFunction & BF)319 void MinBranchGreedyClusterAlgorithm::adjustQueue(std::vector<EdgeTy> &Queue,
320                                                   const BinaryFunction &BF) {
321   // Define a comparison function to establish SWO between edges.
322   auto Comp = [&](const EdgeTy &A, const EdgeTy &B) {
323     // With equal weights, prioritize branches with lower index
324     // source/destination. This helps to keep original block order for blocks
325     // when optimal order cannot be deduced from a profile.
326     if (Weight[A] == Weight[B]) {
327       const signed SrcOrder = BF.getOriginalLayoutRelativeOrder(A.Src, B.Src);
328       return (SrcOrder != 0)
329                  ? SrcOrder > 0
330                  : BF.getOriginalLayoutRelativeOrder(A.Dst, B.Dst) > 0;
331     }
332     return Weight[A] < Weight[B];
333   };
334 
335   // Iterate through all remaining edges to find edges that have their
336   // source and destination in the same cluster.
337   std::vector<EdgeTy> NewQueue;
338   for (const EdgeTy &E : Queue) {
339     const BinaryBasicBlock *SrcBB = E.Src;
340     const BinaryBasicBlock *DstBB = E.Dst;
341 
342     // Case 1: SrcBB and DstBB are the same or DstBB is the entry block. Ignore
343     // this edge.
344     if (SrcBB == DstBB || DstBB == *BF.getLayout().block_begin()) {
345       LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
346                  dbgs() << " (same src, dst)\n");
347       continue;
348     }
349 
350     int I = BBToClusterMap[SrcBB];
351     int J = BBToClusterMap[DstBB];
352     std::vector<BinaryBasicBlock *> &ClusterA = Clusters[I];
353     std::vector<BinaryBasicBlock *> &ClusterB = Clusters[J];
354 
355     // Case 2: They are already allocated at the same cluster or incompatible
356     // clusters. Adjust the weights of edges with the same source or
357     // destination, so that this edge has no effect on them any more, and ignore
358     // this edge. Also increase the intra- (or inter-) cluster edge count.
359     if (I == J || !areClustersCompatible(ClusterA, ClusterB, E)) {
360       if (!ClusterEdges.empty())
361         ClusterEdges[I][J] += E.Count;
362       LLVM_DEBUG(dbgs() << "\tAdjustment: Ignored edge "; E.print(dbgs());
363                  dbgs() << " (src, dst belong to same cluster or incompatible "
364                            "clusters)\n");
365       for (const BinaryBasicBlock *SuccBB : SrcBB->successors()) {
366         if (SuccBB == DstBB)
367           continue;
368         auto WI = Weight.find(EdgeTy(SrcBB, SuccBB, 0));
369         assert(WI != Weight.end() && "CFG edge not found in Weight map");
370         WI->second += (int64_t)E.Count;
371       }
372       for (const BinaryBasicBlock *PredBB : DstBB->predecessors()) {
373         if (PredBB == SrcBB)
374           continue;
375         auto WI = Weight.find(EdgeTy(PredBB, DstBB, 0));
376         assert(WI != Weight.end() && "CFG edge not found in Weight map");
377         WI->second += (int64_t)E.Count;
378       }
379       continue;
380     }
381 
382     // Case 3: None of the previous cases is true, so just keep this edge in
383     // the queue.
384     NewQueue.emplace_back(E);
385   }
386 
387   // Sort remaining edges in increasing weight order.
388   Queue.swap(NewQueue);
389   llvm::sort(Queue, Comp);
390 }
391 
areClustersCompatible(const ClusterTy & Front,const ClusterTy & Back,const EdgeTy & E) const392 bool MinBranchGreedyClusterAlgorithm::areClustersCompatible(
393     const ClusterTy &Front, const ClusterTy &Back, const EdgeTy &E) const {
394   return Front.back() == E.Src && Back.front() == E.Dst;
395 }
396 
reset()397 void MinBranchGreedyClusterAlgorithm::reset() {
398   GreedyClusterAlgorithm::reset();
399   Weight.clear();
400 }
401 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const402 void TSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
403                                              BasicBlockOrder &Order) const {
404   std::vector<std::vector<uint64_t>> Weight;
405   std::vector<BinaryBasicBlock *> IndexToBB;
406 
407   const size_t N = BF.getLayout().block_size();
408   assert(N <= std::numeric_limits<uint64_t>::digits &&
409          "cannot use TSP solution for sizes larger than bits in uint64_t");
410 
411   // Populating weight map and index map
412   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
413     BB->setLayoutIndex(IndexToBB.size());
414     IndexToBB.push_back(BB);
415   }
416   Weight.resize(N);
417   for (const BinaryBasicBlock *BB : BF.getLayout().blocks()) {
418     auto BI = BB->branch_info_begin();
419     Weight[BB->getLayoutIndex()].resize(N);
420     for (BinaryBasicBlock *SuccBB : BB->successors()) {
421       if (BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE)
422         Weight[BB->getLayoutIndex()][SuccBB->getLayoutIndex()] = BI->Count;
423       ++BI;
424     }
425   }
426 
427   std::vector<std::vector<int64_t>> DP;
428   DP.resize(static_cast<size_t>(1) << N);
429   for (std::vector<int64_t> &Elmt : DP)
430     Elmt.resize(N, -1);
431 
432   // Start with the entry basic block being allocated with cost zero
433   DP[1][0] = 0;
434   // Walk through TSP solutions using a bitmask to represent state (current set
435   // of BBs in the layout)
436   uint64_t BestSet = 1;
437   uint64_t BestLast = 0;
438   int64_t BestWeight = 0;
439   for (uint64_t Set = 1; Set < (1ULL << N); ++Set) {
440     // Traverse each possibility of Last BB visited in this layout
441     for (uint64_t Last = 0; Last < N; ++Last) {
442       // Case 1: There is no possible layout with this BB as Last
443       if (DP[Set][Last] == -1)
444         continue;
445 
446       // Case 2: There is a layout with this Set and this Last, and we try
447       // to expand this set with New
448       for (uint64_t New = 1; New < N; ++New) {
449         // Case 2a: BB "New" is already in this Set
450         if ((Set & (1ULL << New)) != 0)
451           continue;
452 
453         // Case 2b: BB "New" is not in this set and we add it to this Set and
454         // record total weight of this layout with "New" as the last BB.
455         uint64_t NewSet = (Set | (1ULL << New));
456         if (DP[NewSet][New] == -1)
457           DP[NewSet][New] = DP[Set][Last] + (int64_t)Weight[Last][New];
458         DP[NewSet][New] = std::max(DP[NewSet][New],
459                                    DP[Set][Last] + (int64_t)Weight[Last][New]);
460 
461         if (DP[NewSet][New] > BestWeight) {
462           BestWeight = DP[NewSet][New];
463           BestSet = NewSet;
464           BestLast = New;
465         }
466       }
467     }
468   }
469 
470   // Define final function layout based on layout that maximizes weight
471   uint64_t Last = BestLast;
472   uint64_t Set = BestSet;
473   BitVector Visited;
474   Visited.resize(N);
475   Visited[Last] = true;
476   Order.push_back(IndexToBB[Last]);
477   Set = Set & ~(1ULL << Last);
478   while (Set != 0) {
479     int64_t Best = -1;
480     uint64_t NewLast;
481     for (uint64_t I = 0; I < N; ++I) {
482       if (DP[Set][I] == -1)
483         continue;
484       int64_t AdjWeight = Weight[I][Last] > 0 ? Weight[I][Last] : 0;
485       if (DP[Set][I] + AdjWeight > Best) {
486         NewLast = I;
487         Best = DP[Set][I] + AdjWeight;
488       }
489     }
490     Last = NewLast;
491     Visited[Last] = true;
492     Order.push_back(IndexToBB[Last]);
493     Set = Set & ~(1ULL << Last);
494   }
495   std::reverse(Order.begin(), Order.end());
496 
497   // Finalize layout with BBs that weren't assigned to the layout using the
498   // input layout.
499   for (BinaryBasicBlock *BB : BF.getLayout().blocks())
500     if (Visited[BB->getLayoutIndex()] == false)
501       Order.push_back(BB);
502 }
503 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const504 void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
505                                                 BasicBlockOrder &Order) const {
506   if (BF.getLayout().block_empty())
507     return;
508 
509   // Do not change layout of functions w/o profile information
510   if (!BF.hasValidProfile() || BF.getLayout().block_size() <= 2) {
511     for (BinaryBasicBlock *BB : BF.getLayout().blocks())
512       Order.push_back(BB);
513     return;
514   }
515 
516   // Create a separate MCCodeEmitter to allow lock-free execution
517   BinaryContext::IndependentCodeEmitter Emitter;
518   if (!opts::NoThreads)
519     Emitter = BF.getBinaryContext().createIndependentMCCodeEmitter();
520 
521   // Initialize CFG nodes and their data
522   std::vector<uint64_t> BlockSizes;
523   std::vector<uint64_t> BlockCounts;
524   BasicBlockOrder OrigOrder;
525   BF.getLayout().updateLayoutIndices();
526   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
527     uint64_t Size = std::max<uint64_t>(BB->estimateSize(Emitter.MCE.get()), 1);
528     BlockSizes.push_back(Size);
529     BlockCounts.push_back(BB->getKnownExecutionCount());
530     OrigOrder.push_back(BB);
531   }
532 
533   // Initialize CFG edges
534   std::vector<codelayout::EdgeCount> JumpCounts;
535   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
536     auto BI = BB->branch_info_begin();
537     for (BinaryBasicBlock *SuccBB : BB->successors()) {
538       assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
539              "missing profile for a jump");
540       JumpCounts.push_back(
541           {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count});
542       ++BI;
543     }
544   }
545 
546   // Run the layout algorithm
547   auto Result =
548       codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
549   Order.reserve(BF.getLayout().block_size());
550   for (uint64_t R : Result)
551     Order.push_back(OrigOrder[R]);
552 }
553 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const554 void OptimizeReorderAlgorithm::reorderBasicBlocks(
555     BinaryFunction &BF, BasicBlockOrder &Order) const {
556   if (BF.getLayout().block_empty())
557     return;
558 
559   // Cluster basic blocks.
560   CAlgo->clusterBasicBlocks(BF);
561 
562   if (opts::PrintClusters)
563     CAlgo->printClusters();
564 
565   // Arrange basic blocks according to clusters.
566   for (ClusterAlgorithm::ClusterTy &Cluster : CAlgo->Clusters)
567     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
568 }
569 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const570 void OptimizeBranchReorderAlgorithm::reorderBasicBlocks(
571     BinaryFunction &BF, BasicBlockOrder &Order) const {
572   if (BF.getLayout().block_empty())
573     return;
574 
575   // Cluster basic blocks.
576   CAlgo->clusterBasicBlocks(BF, /* ComputeEdges = */ true);
577   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
578   std::vector<std::unordered_map<uint32_t, uint64_t>> &ClusterEdges =
579       CAlgo->ClusterEdges;
580 
581   // Compute clusters' average frequencies.
582   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
583   std::vector<double> &AvgFreq = CAlgo->AvgFreq;
584 
585   if (opts::PrintClusters)
586     CAlgo->printClusters();
587 
588   // Cluster layout order
589   std::vector<uint32_t> ClusterOrder;
590 
591   // Do a topological sort for clusters, prioritizing frequently-executed BBs
592   // during the traversal.
593   std::stack<uint32_t> Stack;
594   std::vector<uint32_t> Status;
595   std::vector<uint32_t> Parent;
596   Status.resize(Clusters.size(), 0);
597   Parent.resize(Clusters.size(), 0);
598   constexpr uint32_t STACKED = 1;
599   constexpr uint32_t VISITED = 2;
600   Status[0] = STACKED;
601   Stack.push(0);
602   while (!Stack.empty()) {
603     uint32_t I = Stack.top();
604     if (!(Status[I] & VISITED)) {
605       Status[I] |= VISITED;
606       // Order successors by weight
607       auto ClusterComp = [&ClusterEdges, I](uint32_t A, uint32_t B) {
608         return ClusterEdges[I][A] > ClusterEdges[I][B];
609       };
610       std::priority_queue<uint32_t, std::vector<uint32_t>,
611                           decltype(ClusterComp)>
612           SuccQueue(ClusterComp);
613       for (std::pair<const uint32_t, uint64_t> &Target : ClusterEdges[I]) {
614         if (Target.second > 0 && !(Status[Target.first] & STACKED) &&
615             !Clusters[Target.first].empty()) {
616           Parent[Target.first] = I;
617           Status[Target.first] = STACKED;
618           SuccQueue.push(Target.first);
619         }
620       }
621       while (!SuccQueue.empty()) {
622         Stack.push(SuccQueue.top());
623         SuccQueue.pop();
624       }
625       continue;
626     }
627     // Already visited this node
628     Stack.pop();
629     ClusterOrder.push_back(I);
630   }
631   std::reverse(ClusterOrder.begin(), ClusterOrder.end());
632   // Put unreachable clusters at the end
633   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
634     if (!(Status[I] & VISITED) && !Clusters[I].empty())
635       ClusterOrder.push_back(I);
636 
637   // Sort nodes with equal precedence
638   auto Beg = ClusterOrder.begin();
639   // Don't reorder the first cluster, which contains the function entry point
640   ++Beg;
641   std::stable_sort(Beg, ClusterOrder.end(),
642                    [&AvgFreq, &Parent](uint32_t A, uint32_t B) {
643                      uint32_t P = Parent[A];
644                      while (Parent[P] != 0) {
645                        if (Parent[P] == B)
646                          return false;
647                        P = Parent[P];
648                      }
649                      P = Parent[B];
650                      while (Parent[P] != 0) {
651                        if (Parent[P] == A)
652                          return true;
653                        P = Parent[P];
654                      }
655                      return AvgFreq[A] > AvgFreq[B];
656                    });
657 
658   if (opts::PrintClusters) {
659     errs() << "New cluster order: ";
660     const char *Sep = "";
661     for (uint32_t O : ClusterOrder) {
662       errs() << Sep << O;
663       Sep = ", ";
664     }
665     errs() << '\n';
666   }
667 
668   // Arrange basic blocks according to cluster order.
669   for (uint32_t ClusterIndex : ClusterOrder) {
670     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
671     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
672   }
673 }
674 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const675 void OptimizeCacheReorderAlgorithm::reorderBasicBlocks(
676     BinaryFunction &BF, BasicBlockOrder &Order) const {
677   if (BF.getLayout().block_empty())
678     return;
679 
680   const uint64_t ColdThreshold =
681       opts::ColdThreshold *
682       (*BF.getLayout().block_begin())->getExecutionCount() / 1000;
683 
684   // Cluster basic blocks.
685   CAlgo->clusterBasicBlocks(BF);
686   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
687 
688   // Compute clusters' average frequencies.
689   CAlgo->computeClusterAverageFrequency(BF.getBinaryContext());
690   std::vector<double> &AvgFreq = CAlgo->AvgFreq;
691 
692   if (opts::PrintClusters)
693     CAlgo->printClusters();
694 
695   // Cluster layout order
696   std::vector<uint32_t> ClusterOrder;
697 
698   // Order clusters based on average instruction execution frequency
699   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
700     if (!Clusters[I].empty())
701       ClusterOrder.push_back(I);
702   // Don't reorder the first cluster, which contains the function entry point
703   std::stable_sort(
704       std::next(ClusterOrder.begin()), ClusterOrder.end(),
705       [&AvgFreq](uint32_t A, uint32_t B) { return AvgFreq[A] > AvgFreq[B]; });
706 
707   if (opts::PrintClusters) {
708     errs() << "New cluster order: ";
709     const char *Sep = "";
710     for (uint32_t O : ClusterOrder) {
711       errs() << Sep << O;
712       Sep = ", ";
713     }
714     errs() << '\n';
715   }
716 
717   // Arrange basic blocks according to cluster order.
718   for (uint32_t ClusterIndex : ClusterOrder) {
719     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
720     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
721     // Force zero execution count on clusters that do not meet the cut off
722     // specified by --cold-threshold.
723     if (AvgFreq[ClusterIndex] < static_cast<double>(ColdThreshold))
724       for (BinaryBasicBlock *BBPtr : Cluster)
725         BBPtr->setExecutionCount(0);
726   }
727 }
728 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const729 void ReverseReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
730                                                  BasicBlockOrder &Order) const {
731   if (BF.getLayout().block_empty())
732     return;
733 
734   BinaryBasicBlock *FirstBB = *BF.getLayout().block_begin();
735   Order.push_back(FirstBB);
736   for (auto RLI = BF.getLayout().block_rbegin(); *RLI != FirstBB; ++RLI)
737     Order.push_back(*RLI);
738 }
739 
reorderBasicBlocks(BinaryFunction & BF,BasicBlockOrder & Order) const740 void RandomClusterReorderAlgorithm::reorderBasicBlocks(
741     BinaryFunction &BF, BasicBlockOrder &Order) const {
742   if (BF.getLayout().block_empty())
743     return;
744 
745   // Cluster basic blocks.
746   CAlgo->clusterBasicBlocks(BF);
747   std::vector<ClusterAlgorithm::ClusterTy> &Clusters = CAlgo->Clusters;
748 
749   if (opts::PrintClusters)
750     CAlgo->printClusters();
751 
752   // Cluster layout order
753   std::vector<uint32_t> ClusterOrder;
754 
755   // Order clusters based on average instruction execution frequency
756   for (uint32_t I = 0, E = Clusters.size(); I < E; ++I)
757     if (!Clusters[I].empty())
758       ClusterOrder.push_back(I);
759 
760   std::shuffle(std::next(ClusterOrder.begin()), ClusterOrder.end(),
761                std::default_random_engine(opts::RandomSeed.getValue()));
762 
763   if (opts::PrintClusters) {
764     errs() << "New cluster order: ";
765     const char *Sep = "";
766     for (uint32_t O : ClusterOrder) {
767       errs() << Sep << O;
768       Sep = ", ";
769     }
770     errs() << '\n';
771   }
772 
773   // Arrange basic blocks according to cluster order.
774   for (uint32_t ClusterIndex : ClusterOrder) {
775     ClusterAlgorithm::ClusterTy &Cluster = Clusters[ClusterIndex];
776     Order.insert(Order.end(), Cluster.begin(), Cluster.end());
777   }
778 }
779