xref: /llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision af935cf0ee8a26dcf0860ed0c72d74b4a1cb1314)
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The file implements "cache-aware" layout algorithms of basic blocks and
10 // functions in a binary.
11 //
12 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
13 // optimizing jump locality and thus processor I-cache utilization. This is
14 // achieved via increasing the number of fall-through jumps and co-locating
15 // frequently executed nodes together. The name follows the underlying
16 // optimization problem, Extended-TSP, which is a generalization of classical
17 // (maximum) Traveling Salesmen Problem.
18 //
19 // The algorithm is a greedy heuristic that works with chains (ordered lists)
20 // of basic blocks. Initially all chains are isolated basic blocks. On every
21 // iteration, we pick a pair of chains whose merging yields the biggest increase
22 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
23 // A pair of chains giving the maximum gain is merged into a new chain. The
24 // procedure stops when there is only one chain left, or when merging does not
25 // increase ExtTSP. In the latter case, the remaining chains are sorted by
26 // density in the decreasing order.
27 //
28 // An important aspect is the way two chains are merged. Unlike earlier
29 // algorithms (e.g., based on the approach of Pettis-Hansen), two
30 // chains, X and Y, are first split into three, X1, X2, and Y. Then we
31 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
32 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
33 // This improves the quality of the final result (the search space is larger)
34 // while keeping the implementation sufficiently fast.
35 //
36 // Reference:
37 //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
38 //     IEEE Transactions on Computers, 2020
39 //     https://arxiv.org/abs/1809.04676
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #include "llvm/Transforms/Utils/CodeLayout.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 
47 #include <cmath>
48 #include <set>
49 
50 using namespace llvm;
51 #define DEBUG_TYPE "code-layout"
52 
53 namespace llvm {
54 cl::opt<bool> EnableExtTspBlockPlacement(
55     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
56     cl::desc("Enable machine block placement based on the ext-tsp model, "
57              "optimizing I-cache utilization."));
58 
59 cl::opt<bool> ApplyExtTspWithoutProfile(
60     "ext-tsp-apply-without-profile",
61     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
62     cl::init(true), cl::Hidden);
63 } // namespace llvm
64 
65 // Algorithm-specific params for Ext-TSP. The values are tuned for the best
66 // performance of large-scale front-end bound binaries.
67 static cl::opt<double> ForwardWeightCond(
68     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
69     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
70 
71 static cl::opt<double> ForwardWeightUncond(
72     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
73     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
74 
75 static cl::opt<double> BackwardWeightCond(
76     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
77     cl::desc("The weight of conditional backward jumps for ExtTSP value"));
78 
79 static cl::opt<double> BackwardWeightUncond(
80     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
81     cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
82 
83 static cl::opt<double> FallthroughWeightCond(
84     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
85     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
86 
87 static cl::opt<double> FallthroughWeightUncond(
88     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
89     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
90 
91 static cl::opt<unsigned> ForwardDistance(
92     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
93     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
94 
95 static cl::opt<unsigned> BackwardDistance(
96     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
97     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
98 
99 // The maximum size of a chain created by the algorithm. The size is bounded
100 // so that the algorithm can efficiently process extremely large instance.
101 static cl::opt<unsigned>
102     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
103                  cl::desc("The maximum size of a chain to create."));
104 
105 // The maximum size of a chain for splitting. Larger values of the threshold
106 // may yield better quality at the cost of worsen run-time.
107 static cl::opt<unsigned> ChainSplitThreshold(
108     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
109     cl::desc("The maximum size of a chain to apply splitting"));
110 
111 // The option enables splitting (large) chains along in-coming and out-going
112 // jumps. This typically results in a better quality.
113 static cl::opt<bool> EnableChainSplitAlongJumps(
114     "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
115     cl::desc("The maximum size of a chain to apply splitting"));
116 
117 // Algorithm-specific options for CDS.
118 static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
119                                       cl::desc("The size of the cache"));
120 
121 static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden,
122                                    cl::desc("The size of a line in the cache"));
123 
124 static cl::opt<double> DistancePower(
125     "cds-distance-power", cl::ReallyHidden,
126     cl::desc("The power exponent for the distance-based locality"));
127 
128 static cl::opt<double> FrequencyScale(
129     "cds-frequency-scale", cl::ReallyHidden,
130     cl::desc("The scale factor for the frequency-based locality"));
131 
132 namespace {
133 
134 // Epsilon for comparison of doubles.
135 constexpr double EPS = 1e-8;
136 
137 // Compute the Ext-TSP score for a given jump.
138 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
139                        double Weight) {
140   if (JumpDist > JumpMaxDist)
141     return 0;
142   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
143   return Weight * Prob * Count;
144 }
145 
146 // Compute the Ext-TSP score for a jump between a given pair of blocks,
147 // using their sizes, (estimated) addresses and the jump execution count.
148 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
149                    uint64_t Count, bool IsConditional) {
150   // Fallthrough
151   if (SrcAddr + SrcSize == DstAddr) {
152     return jumpExtTSPScore(0, 1, Count,
153                            IsConditional ? FallthroughWeightCond
154                                          : FallthroughWeightUncond);
155   }
156   // Forward
157   if (SrcAddr + SrcSize < DstAddr) {
158     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
159     return jumpExtTSPScore(Dist, ForwardDistance, Count,
160                            IsConditional ? ForwardWeightCond
161                                          : ForwardWeightUncond);
162   }
163   // Backward
164   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
165   return jumpExtTSPScore(Dist, BackwardDistance, Count,
166                          IsConditional ? BackwardWeightCond
167                                        : BackwardWeightUncond);
168 }
169 
170 /// A type of merging two chains, X and Y. The former chain is split into
171 /// X1 and X2 and then concatenated with Y in the order specified by the type.
172 enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
173 
174 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
175 /// together with the corresponding merge 'type' and 'offset'.
176 struct MergeGainT {
177   explicit MergeGainT() = default;
178   explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
179       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
180 
181   double score() const { return Score; }
182 
183   size_t mergeOffset() const { return MergeOffset; }
184 
185   MergeTypeT mergeType() const { return MergeType; }
186 
187   void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
188 
189   // Returns 'true' iff Other is preferred over this.
190   bool operator<(const MergeGainT &Other) const {
191     return (Other.Score > EPS && Other.Score > Score + EPS);
192   }
193 
194   // Update the current gain if Other is preferred over this.
195   void updateIfLessThan(const MergeGainT &Other) {
196     if (*this < Other)
197       *this = Other;
198   }
199 
200 private:
201   double Score{-1.0};
202   size_t MergeOffset{0};
203   MergeTypeT MergeType{MergeTypeT::X_Y};
204 };
205 
206 struct JumpT;
207 struct ChainT;
208 struct ChainEdge;
209 
210 /// A node in the graph, typically corresponding to a basic block in the CFG or
211 /// a function in the call graph.
212 struct NodeT {
213   NodeT(const NodeT &) = delete;
214   NodeT(NodeT &&) = default;
215   NodeT &operator=(const NodeT &) = delete;
216   NodeT &operator=(NodeT &&) = default;
217 
218   explicit NodeT(size_t Index, uint64_t Size, uint64_t EC)
219       : Index(Index), Size(Size), ExecutionCount(EC) {}
220 
221   bool isEntry() const { return Index == 0; }
222 
223   // The total execution count of outgoing jumps.
224   uint64_t outCount() const;
225 
226   // The total execution count of incoming jumps.
227   uint64_t inCount() const;
228 
229   // The original index of the node in graph.
230   size_t Index{0};
231   // The index of the node in the current chain.
232   size_t CurIndex{0};
233   // The size of the node in the binary.
234   uint64_t Size{0};
235   // The execution count of the node in the profile data.
236   uint64_t ExecutionCount{0};
237   // The current chain of the node.
238   ChainT *CurChain{nullptr};
239   // The offset of the node in the current chain.
240   mutable uint64_t EstimatedAddr{0};
241   // Forced successor of the node in the graph.
242   NodeT *ForcedSucc{nullptr};
243   // Forced predecessor of the node in the graph.
244   NodeT *ForcedPred{nullptr};
245   // Outgoing jumps from the node.
246   std::vector<JumpT *> OutJumps;
247   // Incoming jumps to the node.
248   std::vector<JumpT *> InJumps;
249 };
250 
251 /// An arc in the graph, typically corresponding to a jump between two nodes.
252 struct JumpT {
253   JumpT(const JumpT &) = delete;
254   JumpT(JumpT &&) = default;
255   JumpT &operator=(const JumpT &) = delete;
256   JumpT &operator=(JumpT &&) = default;
257 
258   explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
259       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
260 
261   // Source node of the jump.
262   NodeT *Source;
263   // Target node of the jump.
264   NodeT *Target;
265   // Execution count of the arc in the profile data.
266   uint64_t ExecutionCount{0};
267   // Whether the jump corresponds to a conditional branch.
268   bool IsConditional{false};
269   // The offset of the jump from the source node.
270   uint64_t Offset{0};
271 };
272 
273 /// A chain (ordered sequence) of nodes in the graph.
274 struct ChainT {
275   ChainT(const ChainT &) = delete;
276   ChainT(ChainT &&) = default;
277   ChainT &operator=(const ChainT &) = delete;
278   ChainT &operator=(ChainT &&) = default;
279 
280   explicit ChainT(uint64_t Id, NodeT *Node)
281       : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
282         Nodes(1, Node) {}
283 
284   size_t numBlocks() const { return Nodes.size(); }
285 
286   double density() const { return static_cast<double>(ExecutionCount) / Size; }
287 
288   bool isEntry() const { return Nodes[0]->Index == 0; }
289 
290   bool isCold() const {
291     for (NodeT *Node : Nodes) {
292       if (Node->ExecutionCount > 0)
293         return false;
294     }
295     return true;
296   }
297 
298   ChainEdge *getEdge(ChainT *Other) const {
299     for (const auto &[Chain, ChainEdge] : Edges) {
300       if (Chain == Other)
301         return ChainEdge;
302     }
303     return nullptr;
304   }
305 
306   void removeEdge(ChainT *Other) {
307     auto It = Edges.begin();
308     while (It != Edges.end()) {
309       if (It->first == Other) {
310         Edges.erase(It);
311         return;
312       }
313       It++;
314     }
315   }
316 
317   void addEdge(ChainT *Other, ChainEdge *Edge) {
318     Edges.push_back(std::make_pair(Other, Edge));
319   }
320 
321   void merge(ChainT *Other, const std::vector<NodeT *> &MergedBlocks) {
322     Nodes = MergedBlocks;
323     // Update the chain's data.
324     ExecutionCount += Other->ExecutionCount;
325     Size += Other->Size;
326     Id = Nodes[0]->Index;
327     // Update the node's data.
328     for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
329       Nodes[Idx]->CurChain = this;
330       Nodes[Idx]->CurIndex = Idx;
331     }
332   }
333 
334   void mergeEdges(ChainT *Other);
335 
336   void clear() {
337     Nodes.clear();
338     Nodes.shrink_to_fit();
339     Edges.clear();
340     Edges.shrink_to_fit();
341   }
342 
343   // Unique chain identifier.
344   uint64_t Id;
345   // Cached ext-tsp score for the chain.
346   double Score{0};
347   // The total execution count of the chain.
348   uint64_t ExecutionCount{0};
349   // The total size of the chain.
350   uint64_t Size{0};
351   // Nodes of the chain.
352   std::vector<NodeT *> Nodes;
353   // Adjacent chains and corresponding edges (lists of jumps).
354   std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
355 };
356 
357 /// An edge in the graph representing jumps between two chains.
358 /// When nodes are merged into chains, the edges are combined too so that
359 /// there is always at most one edge between a pair of chains.
360 struct ChainEdge {
361   ChainEdge(const ChainEdge &) = delete;
362   ChainEdge(ChainEdge &&) = default;
363   ChainEdge &operator=(const ChainEdge &) = delete;
364   ChainEdge &operator=(ChainEdge &&) = delete;
365 
366   explicit ChainEdge(JumpT *Jump)
367       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
368         Jumps(1, Jump) {}
369 
370   ChainT *srcChain() const { return SrcChain; }
371 
372   ChainT *dstChain() const { return DstChain; }
373 
374   bool isSelfEdge() const { return SrcChain == DstChain; }
375 
376   const std::vector<JumpT *> &jumps() const { return Jumps; }
377 
378   void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
379 
380   void moveJumps(ChainEdge *Other) {
381     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
382     Other->Jumps.clear();
383     Other->Jumps.shrink_to_fit();
384   }
385 
386   void changeEndpoint(ChainT *From, ChainT *To) {
387     if (From == SrcChain)
388       SrcChain = To;
389     if (From == DstChain)
390       DstChain = To;
391   }
392 
393   bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
394     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
395   }
396 
397   MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
398     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
399   }
400 
401   void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
402     if (Src == SrcChain) {
403       CachedGainForward = MergeGain;
404       CacheValidForward = true;
405     } else {
406       CachedGainBackward = MergeGain;
407       CacheValidBackward = true;
408     }
409   }
410 
411   void invalidateCache() {
412     CacheValidForward = false;
413     CacheValidBackward = false;
414   }
415 
416   void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
417 
418   MergeGainT getMergeGain() const { return CachedGain; }
419 
420   double gain() const { return CachedGain.score(); }
421 
422 private:
423   // Source chain.
424   ChainT *SrcChain{nullptr};
425   // Destination chain.
426   ChainT *DstChain{nullptr};
427   // Original jumps in the binary with corresponding execution counts.
428   std::vector<JumpT *> Jumps;
429   // Cached gain value for merging the pair of chains.
430   MergeGainT CachedGain;
431 
432   // Cached gain values for merging the pair of chains. Since the gain of
433   // merging (Src, Dst) and (Dst, Src) might be different, we store both values
434   // here and a flag indicating which of the options results in a higher gain.
435   // Cached gain values.
436   MergeGainT CachedGainForward;
437   MergeGainT CachedGainBackward;
438   // Whether the cached value must be recomputed.
439   bool CacheValidForward{false};
440   bool CacheValidBackward{false};
441 };
442 
443 uint64_t NodeT::outCount() const {
444   uint64_t Count = 0;
445   for (JumpT *Jump : OutJumps)
446     Count += Jump->ExecutionCount;
447   return Count;
448 }
449 
450 uint64_t NodeT::inCount() const {
451   uint64_t Count = 0;
452   for (JumpT *Jump : InJumps)
453     Count += Jump->ExecutionCount;
454   return Count;
455 }
456 
457 void ChainT::mergeEdges(ChainT *Other) {
458   // Update edges adjacent to chain Other.
459   for (const auto &[DstChain, DstEdge] : Other->Edges) {
460     ChainT *TargetChain = DstChain == Other ? this : DstChain;
461     ChainEdge *CurEdge = getEdge(TargetChain);
462     if (CurEdge == nullptr) {
463       DstEdge->changeEndpoint(Other, this);
464       this->addEdge(TargetChain, DstEdge);
465       if (DstChain != this && DstChain != Other)
466         DstChain->addEdge(this, DstEdge);
467     } else {
468       CurEdge->moveJumps(DstEdge);
469     }
470     // Cleanup leftover edge.
471     if (DstChain != Other)
472       DstChain->removeEdge(Other);
473   }
474 }
475 
476 using NodeIter = std::vector<NodeT *>::const_iterator;
477 
478 /// A wrapper around three chains of nodes; it is used to avoid extra
479 /// instantiation of the vectors.
480 struct MergedChain {
481   MergedChain(NodeIter Begin1, NodeIter End1, NodeIter Begin2 = NodeIter(),
482               NodeIter End2 = NodeIter(), NodeIter Begin3 = NodeIter(),
483               NodeIter End3 = NodeIter())
484       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
485         End3(End3) {}
486 
487   template <typename F> void forEach(const F &Func) const {
488     for (auto It = Begin1; It != End1; It++)
489       Func(*It);
490     for (auto It = Begin2; It != End2; It++)
491       Func(*It);
492     for (auto It = Begin3; It != End3; It++)
493       Func(*It);
494   }
495 
496   std::vector<NodeT *> getNodes() const {
497     std::vector<NodeT *> Result;
498     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
499                    std::distance(Begin3, End3));
500     Result.insert(Result.end(), Begin1, End1);
501     Result.insert(Result.end(), Begin2, End2);
502     Result.insert(Result.end(), Begin3, End3);
503     return Result;
504   }
505 
506   const NodeT *getFirstNode() const { return *Begin1; }
507 
508 private:
509   NodeIter Begin1;
510   NodeIter End1;
511   NodeIter Begin2;
512   NodeIter End2;
513   NodeIter Begin3;
514   NodeIter End3;
515 };
516 
517 /// Merge two chains of nodes respecting a given 'type' and 'offset'.
518 ///
519 /// If MergeType == 0, then the result is a concatenation of two chains.
520 /// Otherwise, the first chain is cut into two sub-chains at the offset,
521 /// and merged using all possible ways of concatenating three chains.
522 MergedChain mergeNodes(const std::vector<NodeT *> &X,
523                        const std::vector<NodeT *> &Y, size_t MergeOffset,
524                        MergeTypeT MergeType) {
525   // Split the first chain, X, into X1 and X2.
526   NodeIter BeginX1 = X.begin();
527   NodeIter EndX1 = X.begin() + MergeOffset;
528   NodeIter BeginX2 = X.begin() + MergeOffset;
529   NodeIter EndX2 = X.end();
530   NodeIter BeginY = Y.begin();
531   NodeIter EndY = Y.end();
532 
533   // Construct a new chain from the three existing ones.
534   switch (MergeType) {
535   case MergeTypeT::X_Y:
536     return MergedChain(BeginX1, EndX2, BeginY, EndY);
537   case MergeTypeT::Y_X:
538     return MergedChain(BeginY, EndY, BeginX1, EndX2);
539   case MergeTypeT::X1_Y_X2:
540     return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
541   case MergeTypeT::Y_X2_X1:
542     return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
543   case MergeTypeT::X2_X1_Y:
544     return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
545   }
546   llvm_unreachable("unexpected chain merge type");
547 }
548 
549 /// The implementation of the ExtTSP algorithm.
550 class ExtTSPImpl {
551 public:
552   ExtTSPImpl(const std::vector<uint64_t> &NodeSizes,
553              const std::vector<uint64_t> &NodeCounts,
554              const std::vector<EdgeCountT> &EdgeCounts)
555       : NumNodes(NodeSizes.size()) {
556     initialize(NodeSizes, NodeCounts, EdgeCounts);
557   }
558 
559   /// Run the algorithm and return an optimized ordering of nodes.
560   void run(std::vector<uint64_t> &Result) {
561     // Pass 1: Merge nodes with their mutually forced successors
562     mergeForcedPairs();
563 
564     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
565     mergeChainPairs();
566 
567     // Pass 3: Merge cold nodes to reduce code size
568     mergeColdChains();
569 
570     // Collect nodes from all chains
571     concatChains(Result);
572   }
573 
574 private:
575   /// Initialize the algorithm's data structures.
576   void initialize(const std::vector<uint64_t> &NodeSizes,
577                   const std::vector<uint64_t> &NodeCounts,
578                   const std::vector<EdgeCountT> &EdgeCounts) {
579     // Initialize nodes
580     AllNodes.reserve(NumNodes);
581     for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
582       uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
583       uint64_t ExecutionCount = NodeCounts[Idx];
584       // The execution count of the entry node is set to at least one.
585       if (Idx == 0 && ExecutionCount == 0)
586         ExecutionCount = 1;
587       AllNodes.emplace_back(Idx, Size, ExecutionCount);
588     }
589 
590     // Initialize jumps between nodes
591     SuccNodes.resize(NumNodes);
592     PredNodes.resize(NumNodes);
593     std::vector<uint64_t> OutDegree(NumNodes, 0);
594     AllJumps.reserve(EdgeCounts.size());
595     for (auto It : EdgeCounts) {
596       uint64_t Pred = It.first.first;
597       uint64_t Succ = It.first.second;
598       OutDegree[Pred]++;
599       // Ignore self-edges.
600       if (Pred == Succ)
601         continue;
602 
603       SuccNodes[Pred].push_back(Succ);
604       PredNodes[Succ].push_back(Pred);
605       uint64_t ExecutionCount = It.second;
606       if (ExecutionCount > 0) {
607         NodeT &PredNode = AllNodes[Pred];
608         NodeT &SuccNode = AllNodes[Succ];
609         AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount);
610         SuccNode.InJumps.push_back(&AllJumps.back());
611         PredNode.OutJumps.push_back(&AllJumps.back());
612       }
613     }
614     for (JumpT &Jump : AllJumps) {
615       assert(OutDegree[Jump.Source->Index] > 0);
616       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
617     }
618 
619     // Initialize chains.
620     AllChains.reserve(NumNodes);
621     HotChains.reserve(NumNodes);
622     for (NodeT &Node : AllNodes) {
623       AllChains.emplace_back(Node.Index, &Node);
624       Node.CurChain = &AllChains.back();
625       if (Node.ExecutionCount > 0)
626         HotChains.push_back(&AllChains.back());
627     }
628 
629     // Initialize chain edges.
630     AllEdges.reserve(AllJumps.size());
631     for (NodeT &PredNode : AllNodes) {
632       for (JumpT *Jump : PredNode.OutJumps) {
633         NodeT *SuccNode = Jump->Target;
634         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
635         // this edge is already present in the graph.
636         if (CurEdge != nullptr) {
637           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
638           CurEdge->appendJump(Jump);
639           continue;
640         }
641         // this is a new edge.
642         AllEdges.emplace_back(Jump);
643         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
644         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
645       }
646     }
647   }
648 
649   /// For a pair of nodes, A and B, node B is the forced successor of A,
650   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
651   /// to B are from A. Such nodes should be adjacent in the optimal ordering;
652   /// the method finds and merges such pairs of nodes.
653   void mergeForcedPairs() {
654     // Find fallthroughs based on edge weights.
655     for (NodeT &Node : AllNodes) {
656       if (SuccNodes[Node.Index].size() == 1 &&
657           PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
658           SuccNodes[Node.Index][0] != 0) {
659         size_t SuccIndex = SuccNodes[Node.Index][0];
660         Node.ForcedSucc = &AllNodes[SuccIndex];
661         AllNodes[SuccIndex].ForcedPred = &Node;
662       }
663     }
664 
665     // There might be 'cycles' in the forced dependencies, since profile
666     // data isn't 100% accurate. Typically this is observed in loops, when the
667     // loop edges are the hottest successors for the basic blocks of the loop.
668     // Break the cycles by choosing the node with the smallest index as the
669     // head. This helps to keep the original order of the loops, which likely
670     // have already been rotated in the optimized manner.
671     for (NodeT &Node : AllNodes) {
672       if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
673         continue;
674 
675       NodeT *SuccNode = Node.ForcedSucc;
676       while (SuccNode != nullptr && SuccNode != &Node) {
677         SuccNode = SuccNode->ForcedSucc;
678       }
679       if (SuccNode == nullptr)
680         continue;
681       // Break the cycle.
682       AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
683       Node.ForcedPred = nullptr;
684     }
685 
686     // Merge nodes with their fallthrough successors.
687     for (NodeT &Node : AllNodes) {
688       if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
689         const NodeT *CurBlock = &Node;
690         while (CurBlock->ForcedSucc != nullptr) {
691           const NodeT *NextBlock = CurBlock->ForcedSucc;
692           mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
693           CurBlock = NextBlock;
694         }
695       }
696     }
697   }
698 
699   /// Merge pairs of chains while improving the ExtTSP objective.
700   void mergeChainPairs() {
701     /// Deterministically compare pairs of chains.
702     auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
703                                 const ChainT *A2, const ChainT *B2) {
704       if (A1 != A2)
705         return A1->Id < A2->Id;
706       return B1->Id < B2->Id;
707     };
708 
709     while (HotChains.size() > 1) {
710       ChainT *BestChainPred = nullptr;
711       ChainT *BestChainSucc = nullptr;
712       MergeGainT BestGain;
713       // Iterate over all pairs of chains.
714       for (ChainT *ChainPred : HotChains) {
715         // Get candidates for merging with the current chain.
716         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
717           // Ignore loop edges.
718           if (ChainPred == ChainSucc)
719             continue;
720 
721           // Stop early if the combined chain violates the maximum allowed size.
722           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
723             continue;
724 
725           // Compute the gain of merging the two chains.
726           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
727           if (CurGain.score() <= EPS)
728             continue;
729 
730           if (BestGain < CurGain ||
731               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
732                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
733                                  BestChainSucc))) {
734             BestGain = CurGain;
735             BestChainPred = ChainPred;
736             BestChainSucc = ChainSucc;
737           }
738         }
739       }
740 
741       // Stop merging when there is no improvement.
742       if (BestGain.score() <= EPS)
743         break;
744 
745       // Merge the best pair of chains.
746       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
747                   BestGain.mergeType());
748     }
749   }
750 
751   /// Merge remaining nodes into chains w/o taking jump counts into
752   /// consideration. This allows to maintain the original node order in the
753   /// absence of profile data.
754   void mergeColdChains() {
755     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
756       // Iterating in reverse order to make sure original fallthrough jumps are
757       // merged first; this might be beneficial for code size.
758       size_t NumSuccs = SuccNodes[SrcBB].size();
759       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
760         size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
761         ChainT *SrcChain = AllNodes[SrcBB].CurChain;
762         ChainT *DstChain = AllNodes[DstBB].CurChain;
763         if (SrcChain != DstChain && !DstChain->isEntry() &&
764             SrcChain->Nodes.back()->Index == SrcBB &&
765             DstChain->Nodes.front()->Index == DstBB &&
766             SrcChain->isCold() == DstChain->isCold()) {
767           mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
768         }
769       }
770     }
771   }
772 
773   /// Compute the Ext-TSP score for a given node order and a list of jumps.
774   double extTSPScore(const MergedChain &MergedBlocks,
775                      const std::vector<JumpT *> &Jumps) const {
776     if (Jumps.empty())
777       return 0.0;
778     uint64_t CurAddr = 0;
779     MergedBlocks.forEach([&](const NodeT *Node) {
780       Node->EstimatedAddr = CurAddr;
781       CurAddr += Node->Size;
782     });
783 
784     double Score = 0;
785     for (JumpT *Jump : Jumps) {
786       const NodeT *SrcBlock = Jump->Source;
787       const NodeT *DstBlock = Jump->Target;
788       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
789                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
790                              Jump->IsConditional);
791     }
792     return Score;
793   }
794 
795   /// Compute the gain of merging two chains.
796   ///
797   /// The function considers all possible ways of merging two chains and
798   /// computes the one having the largest increase in ExtTSP objective. The
799   /// result is a pair with the first element being the gain and the second
800   /// element being the corresponding merging type.
801   MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
802                               ChainEdge *Edge) const {
803     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
804       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
805     }
806 
807     // Precompute jumps between ChainPred and ChainSucc.
808     auto Jumps = Edge->jumps();
809     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
810     if (EdgePP != nullptr) {
811       Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
812     }
813     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
814 
815     // This object holds the best chosen gain of merging two chains.
816     MergeGainT Gain = MergeGainT();
817 
818     /// Given a merge offset and a list of merge types, try to merge two chains
819     /// and update Gain with a better alternative.
820     auto tryChainMerging = [&](size_t Offset,
821                                const std::vector<MergeTypeT> &MergeTypes) {
822       // Skip merging corresponding to concatenation w/o splitting.
823       if (Offset == 0 || Offset == ChainPred->Nodes.size())
824         return;
825       // Skip merging if it breaks Forced successors.
826       NodeT *Node = ChainPred->Nodes[Offset - 1];
827       if (Node->ForcedSucc != nullptr)
828         return;
829       // Apply the merge, compute the corresponding gain, and update the best
830       // value, if the merge is beneficial.
831       for (const MergeTypeT &MergeType : MergeTypes) {
832         Gain.updateIfLessThan(
833             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
834       }
835     };
836 
837     // Try to concatenate two chains w/o splitting.
838     Gain.updateIfLessThan(
839         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
840 
841     if (EnableChainSplitAlongJumps) {
842       // Attach (a part of) ChainPred before the first node of ChainSucc.
843       for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
844         const NodeT *SrcBlock = Jump->Source;
845         if (SrcBlock->CurChain != ChainPred)
846           continue;
847         size_t Offset = SrcBlock->CurIndex + 1;
848         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
849       }
850 
851       // Attach (a part of) ChainPred after the last node of ChainSucc.
852       for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
853         const NodeT *DstBlock = Jump->Target;
854         if (DstBlock->CurChain != ChainPred)
855           continue;
856         size_t Offset = DstBlock->CurIndex;
857         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
858       }
859     }
860 
861     // Try to break ChainPred in various ways and concatenate with ChainSucc.
862     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
863       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
864         // Try to split the chain in different ways. In practice, applying
865         // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
866         // it from consideration to reduce the search space.
867         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
868                                  MergeTypeT::X2_X1_Y});
869       }
870     }
871     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
872     return Gain;
873   }
874 
875   /// Compute the score gain of merging two chains, respecting a given
876   /// merge 'type' and 'offset'.
877   ///
878   /// The two chains are not modified in the method.
879   MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
880                               const std::vector<JumpT *> &Jumps,
881                               size_t MergeOffset, MergeTypeT MergeType) const {
882     auto MergedBlocks =
883         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
884 
885     // Do not allow a merge that does not preserve the original entry point.
886     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
887         !MergedBlocks.getFirstNode()->isEntry())
888       return MergeGainT();
889 
890     // The gain for the new chain.
891     auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score;
892     return MergeGainT(NewGainScore, MergeOffset, MergeType);
893   }
894 
895   /// Merge chain From into chain Into, update the list of active chains,
896   /// adjacency information, and the corresponding cached values.
897   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
898                    MergeTypeT MergeType) {
899     assert(Into != From && "a chain cannot be merged with itself");
900 
901     // Merge the nodes.
902     MergedChain MergedNodes =
903         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
904     Into->merge(From, MergedNodes.getNodes());
905 
906     // Merge the edges.
907     Into->mergeEdges(From);
908     From->clear();
909 
910     // Update cached ext-tsp score for the new chain.
911     ChainEdge *SelfEdge = Into->getEdge(Into);
912     if (SelfEdge != nullptr) {
913       MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end());
914       Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps());
915     }
916 
917     // Remove the chain from the list of active chains.
918     llvm::erase_value(HotChains, From);
919 
920     // Invalidate caches.
921     for (auto EdgeIt : Into->Edges)
922       EdgeIt.second->invalidateCache();
923   }
924 
925   /// Concatenate all chains into the final order.
926   void concatChains(std::vector<uint64_t> &Order) {
927     // Collect chains and calculate density stats for their sorting.
928     std::vector<const ChainT *> SortedChains;
929     DenseMap<const ChainT *, double> ChainDensity;
930     for (ChainT &Chain : AllChains) {
931       if (!Chain.Nodes.empty()) {
932         SortedChains.push_back(&Chain);
933         // Using doubles to avoid overflow of ExecutionCounts.
934         double Size = 0;
935         double ExecutionCount = 0;
936         for (NodeT *Node : Chain.Nodes) {
937           Size += static_cast<double>(Node->Size);
938           ExecutionCount += static_cast<double>(Node->ExecutionCount);
939         }
940         assert(Size > 0 && "a chain of zero size");
941         ChainDensity[&Chain] = ExecutionCount / Size;
942       }
943     }
944 
945     // Sorting chains by density in the decreasing order.
946     std::sort(SortedChains.begin(), SortedChains.end(),
947               [&](const ChainT *L, const ChainT *R) {
948                 // Place the entry point is at the beginning of the order.
949                 if (L->isEntry() != R->isEntry())
950                   return L->isEntry();
951 
952                 const double DL = ChainDensity[L];
953                 const double DR = ChainDensity[R];
954                 // Compare by density and break ties by chain identifiers.
955                 return std::make_tuple(-DL, L->Id) <
956                        std::make_tuple(-DR, R->Id);
957               });
958 
959     // Collect the nodes in the order specified by their chains.
960     Order.reserve(NumNodes);
961     for (const ChainT *Chain : SortedChains) {
962       for (NodeT *Node : Chain->Nodes) {
963         Order.push_back(Node->Index);
964       }
965     }
966   }
967 
968 private:
969   /// The number of nodes in the graph.
970   const size_t NumNodes;
971 
972   /// Successors of each node.
973   std::vector<std::vector<uint64_t>> SuccNodes;
974 
975   /// Predecessors of each node.
976   std::vector<std::vector<uint64_t>> PredNodes;
977 
978   /// All nodes (basic blocks) in the graph.
979   std::vector<NodeT> AllNodes;
980 
981   /// All jumps between the nodes.
982   std::vector<JumpT> AllJumps;
983 
984   /// All chains of nodes.
985   std::vector<ChainT> AllChains;
986 
987   /// All edges between the chains.
988   std::vector<ChainEdge> AllEdges;
989 
990   /// Active chains. The vector gets updated at runtime when chains are merged.
991   std::vector<ChainT *> HotChains;
992 };
993 
994 /// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
995 /// functions represented by a call graph.
996 class CDSortImpl {
997 public:
998   CDSortImpl(const CDSortConfig &Config, const std::vector<uint64_t> &NodeSizes,
999              const std::vector<uint64_t> &NodeCounts,
1000              const std::vector<EdgeCountT> &EdgeCounts,
1001              const std::vector<uint64_t> &EdgeOffsets)
1002       : Config(Config), NumNodes(NodeSizes.size()) {
1003     initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1004   }
1005 
1006   /// Run the algorithm and return an ordered set of function clusters.
1007   void run(std::vector<uint64_t> &Result) {
1008     // Merge pairs of chains while improving the objective.
1009     mergeChainPairs();
1010 
1011     LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1012                       << " of chains from " << NumNodes << " to "
1013                       << HotChains.size() << "\n");
1014 
1015     // Collect nodes from all the chains.
1016     concatChains(Result);
1017   }
1018 
1019 private:
1020   /// Initialize the algorithm's data structures.
1021   void initialize(const std::vector<uint64_t> &NodeSizes,
1022                   const std::vector<uint64_t> &NodeCounts,
1023                   const std::vector<EdgeCountT> &EdgeCounts,
1024                   const std::vector<uint64_t> &EdgeOffsets) {
1025     // Initialize nodes.
1026     AllNodes.reserve(NumNodes);
1027     for (uint64_t Node = 0; Node < NumNodes; Node++) {
1028       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1029       uint64_t ExecutionCount = NodeCounts[Node];
1030       AllNodes.emplace_back(Node, Size, ExecutionCount);
1031       TotalSamples += ExecutionCount;
1032       if (ExecutionCount > 0)
1033         TotalSize += Size;
1034     }
1035 
1036     // Initialize jumps between the nodes.
1037     SuccNodes.resize(NumNodes);
1038     PredNodes.resize(NumNodes);
1039     AllJumps.reserve(EdgeCounts.size());
1040     for (size_t I = 0; I < EdgeCounts.size(); I++) {
1041       auto It = EdgeCounts[I];
1042       uint64_t Pred = It.first.first;
1043       uint64_t Succ = It.first.second;
1044       // Ignore recursive calls.
1045       if (Pred == Succ)
1046         continue;
1047 
1048       SuccNodes[Pred].push_back(Succ);
1049       PredNodes[Succ].push_back(Pred);
1050       uint64_t ExecutionCount = It.second;
1051       if (ExecutionCount > 0) {
1052         NodeT &PredNode = AllNodes[Pred];
1053         NodeT &SuccNode = AllNodes[Succ];
1054         AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount);
1055         AllJumps.back().Offset = EdgeOffsets[I];
1056         SuccNode.InJumps.push_back(&AllJumps.back());
1057         PredNode.OutJumps.push_back(&AllJumps.back());
1058       }
1059     }
1060 
1061     // Initialize chains.
1062     AllChains.reserve(NumNodes);
1063     HotChains.reserve(NumNodes);
1064     for (NodeT &Node : AllNodes) {
1065       // Adjust execution counts.
1066       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1067       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1068       // Create chain.
1069       AllChains.emplace_back(Node.Index, &Node);
1070       Node.CurChain = &AllChains.back();
1071       if (Node.ExecutionCount > 0)
1072         HotChains.push_back(&AllChains.back());
1073     }
1074 
1075     // Initialize chain edges.
1076     AllEdges.reserve(AllJumps.size());
1077     for (NodeT &PredNode : AllNodes) {
1078       for (JumpT *Jump : PredNode.OutJumps) {
1079         NodeT *SuccNode = Jump->Target;
1080         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1081         // this edge is already present in the graph.
1082         if (CurEdge != nullptr) {
1083           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1084           CurEdge->appendJump(Jump);
1085           continue;
1086         }
1087         // this is a new edge.
1088         AllEdges.emplace_back(Jump);
1089         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1090         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1091       }
1092     }
1093   }
1094 
1095   /// Merge pairs of chains while there is an improvement in the objective.
1096   void mergeChainPairs() {
1097     // Create a priority queue containing all edges ordered by the merge gain.
1098     auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1099       return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1100              std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1101     };
1102     std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1103 
1104     // Insert the edges into the queue.
1105     for (ChainT *ChainPred : HotChains) {
1106       for (const auto &[_, Edge] : ChainPred->Edges) {
1107         // Ignore self-edges.
1108         if (Edge->isSelfEdge())
1109           continue;
1110         // Ignore already processed edges.
1111         if (Edge->gain() != -1.0)
1112           continue;
1113 
1114         // Compute the gain of merging the two chains.
1115         MergeGainT Gain = getBestMergeGain(Edge);
1116         Edge->setMergeGain(Gain);
1117 
1118         if (Edge->gain() > EPS)
1119           Queue.insert(Edge);
1120       }
1121     }
1122 
1123     // Merge the chains while the gain of merging is positive.
1124     while (!Queue.empty()) {
1125       // Extract the best (top) edge for merging.
1126       ChainEdge *BestEdge = *Queue.begin();
1127       Queue.erase(Queue.begin());
1128       // Ignore self-edges.
1129       if (BestEdge->isSelfEdge())
1130         continue;
1131       // Ignore edges with non-positive gains.
1132       if (BestEdge->gain() <= EPS)
1133         continue;
1134 
1135       ChainT *BestSrcChain = BestEdge->srcChain();
1136       ChainT *BestDstChain = BestEdge->dstChain();
1137 
1138       // Remove outdated edges from the queue.
1139       for (const auto &[_, ChainEdge] : BestSrcChain->Edges)
1140         Queue.erase(ChainEdge);
1141       for (const auto &[_, ChainEdge] : BestDstChain->Edges)
1142         Queue.erase(ChainEdge);
1143 
1144       // Merge the best pair of chains.
1145       MergeGainT BestGain = BestEdge->getMergeGain();
1146       mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1147                   BestGain.mergeType());
1148 
1149       // Insert newly created edges into the queue.
1150       for (const auto &[_, Edge] : BestSrcChain->Edges) {
1151         // Ignore loop edges.
1152         if (Edge->isSelfEdge())
1153           continue;
1154 
1155         // Compute the gain of merging the two chains.
1156         MergeGainT Gain = getBestMergeGain(Edge);
1157         Edge->setMergeGain(Gain);
1158 
1159         if (Edge->gain() > EPS)
1160           Queue.insert(Edge);
1161       }
1162     }
1163   }
1164 
1165   /// Compute the gain of merging two chains.
1166   ///
1167   /// The function considers all possible ways of merging two chains and
1168   /// computes the one having the largest increase in ExtTSP objective. The
1169   /// result is a pair with the first element being the gain and the second
1170   /// element being the corresponding merging type.
1171   MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1172     // Precompute jumps between ChainPred and ChainSucc.
1173     auto Jumps = Edge->jumps();
1174     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
1175     ChainT *SrcChain = Edge->srcChain();
1176     ChainT *DstChain = Edge->dstChain();
1177 
1178     // This object holds the best currently chosen gain of merging two chains.
1179     MergeGainT Gain = MergeGainT();
1180 
1181     /// Given a list of merge types, try to merge two chains and update Gain
1182     /// with a better alternative.
1183     auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1184       // Apply the merge, compute the corresponding gain, and update the best
1185       // value, if the merge is beneficial.
1186       for (const MergeTypeT &MergeType : MergeTypes) {
1187         MergeGainT NewGain =
1188             computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1189 
1190         // When forward and backward gains are the same, prioritize merging that
1191         // preserves the original order of the functions in the binary.
1192         if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1193           if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1194               (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1195             Gain = NewGain;
1196           }
1197         } else if (NewGain.score() > Gain.score() + EPS) {
1198           Gain = NewGain;
1199         }
1200       }
1201     };
1202 
1203     // Try to concatenate two chains w/o splitting.
1204     tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1205 
1206     return Gain;
1207   }
1208 
1209   /// Compute the score gain of merging two chains, respecting a given type.
1210   ///
1211   /// The two chains are not modified in the method.
1212   MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1213                               const std::vector<JumpT *> &Jumps,
1214                               MergeTypeT MergeType) const {
1215     // This doesn't depend on the ordering of the nodes
1216     double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1217 
1218     // Merge offset is always 0, as the chains are not split.
1219     size_t MergeOffset = 0;
1220     auto MergedBlocks =
1221         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1222     double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1223 
1224     double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1225     // Scale the result to increase the importance of merging short chains.
1226     if (GainScore >= 0.0)
1227       GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1228 
1229     return MergeGainT(GainScore, MergeOffset, MergeType);
1230   }
1231 
1232   /// Compute the change of the frequency locality after merging the chains.
1233   double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1234     auto missProbability = [&](double ChainDensity) {
1235       double PageSamples = ChainDensity * Config.CacheSize;
1236       if (PageSamples >= TotalSamples)
1237         return 0.0;
1238       double P = PageSamples / TotalSamples;
1239       return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1240     };
1241 
1242     // Cache misses on the chains before merging.
1243     double CurScore =
1244         ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1245         ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1246 
1247     // Cache misses on the merged chain
1248     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1249     double MergedSize = ChainPred->Size + ChainSucc->Size;
1250     double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1251     double NewScore = MergedCounts * missProbability(MergedDensity);
1252 
1253     return CurScore - NewScore;
1254   }
1255 
1256   /// Compute the distance locality for a jump / call.
1257   double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1258     uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1259     double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1260     return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1261   }
1262 
1263   /// Compute the change of the distance locality after merging the chains.
1264   double distBasedLocalityGain(const MergedChain &MergedBlocks,
1265                                const std::vector<JumpT *> &Jumps) const {
1266     if (Jumps.empty())
1267       return 0.0;
1268     uint64_t CurAddr = 0;
1269     MergedBlocks.forEach([&](const NodeT *Node) {
1270       Node->EstimatedAddr = CurAddr;
1271       CurAddr += Node->Size;
1272     });
1273 
1274     double CurScore = 0;
1275     double NewScore = 0;
1276     for (const JumpT *Arc : Jumps) {
1277       uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset;
1278       uint64_t DstAddr = Arc->Target->EstimatedAddr;
1279       NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount);
1280       CurScore += distScore(0, TotalSize, Arc->ExecutionCount);
1281     }
1282     return NewScore - CurScore;
1283   }
1284 
1285   /// Merge chain From into chain Into, update the list of active chains,
1286   /// adjacency information, and the corresponding cached values.
1287   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1288                    MergeTypeT MergeType) {
1289     assert(Into != From && "a chain cannot be merged with itself");
1290 
1291     // Merge the nodes.
1292     MergedChain MergedNodes =
1293         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1294     Into->merge(From, MergedNodes.getNodes());
1295 
1296     // Merge the edges.
1297     Into->mergeEdges(From);
1298     From->clear();
1299 
1300     // Remove the chain from the list of active chains.
1301     llvm::erase_value(HotChains, From);
1302   }
1303 
1304   /// Concatenate all chains into the final order.
1305   void concatChains(std::vector<uint64_t> &Order) {
1306     // Collect chains and calculate density stats for their sorting.
1307     std::vector<const ChainT *> SortedChains;
1308     DenseMap<const ChainT *, double> ChainDensity;
1309     for (ChainT &Chain : AllChains) {
1310       if (!Chain.Nodes.empty()) {
1311         SortedChains.push_back(&Chain);
1312         // Using doubles to avoid overflow of ExecutionCounts.
1313         double Size = 0;
1314         double ExecutionCount = 0;
1315         for (NodeT *Node : Chain.Nodes) {
1316           Size += static_cast<double>(Node->Size);
1317           ExecutionCount += static_cast<double>(Node->ExecutionCount);
1318         }
1319         assert(Size > 0 && "a chain of zero size");
1320         ChainDensity[&Chain] = ExecutionCount / Size;
1321       }
1322     }
1323 
1324     // Sort chains by density in the decreasing order.
1325     std::sort(SortedChains.begin(), SortedChains.end(),
1326               [&](const ChainT *L, const ChainT *R) {
1327                 const double DL = ChainDensity[L];
1328                 const double DR = ChainDensity[R];
1329                 // Compare by density and break ties by chain identifiers.
1330                 return std::make_tuple(-DL, L->Id) <
1331                        std::make_tuple(-DR, R->Id);
1332               });
1333 
1334     // Collect the nodes in the order specified by their chains.
1335     Order.reserve(NumNodes);
1336     for (const ChainT *Chain : SortedChains)
1337       for (NodeT *Node : Chain->Nodes)
1338         Order.push_back(Node->Index);
1339   }
1340 
1341 private:
1342   /// Config for the algorithm.
1343   const CDSortConfig Config;
1344 
1345   /// The number of nodes in the graph.
1346   const size_t NumNodes;
1347 
1348   /// Successors of each node.
1349   std::vector<std::vector<uint64_t>> SuccNodes;
1350 
1351   /// Predecessors of each node.
1352   std::vector<std::vector<uint64_t>> PredNodes;
1353 
1354   /// All nodes (functions) in the graph.
1355   std::vector<NodeT> AllNodes;
1356 
1357   /// All jumps (function calls) between the nodes.
1358   std::vector<JumpT> AllJumps;
1359 
1360   /// All chains of nodes.
1361   std::vector<ChainT> AllChains;
1362 
1363   /// All edges between the chains.
1364   std::vector<ChainEdge> AllEdges;
1365 
1366   /// Active chains. The vector gets updated at runtime when chains are merged.
1367   std::vector<ChainT *> HotChains;
1368 
1369   /// The total number of samples in the graph.
1370   uint64_t TotalSamples{0};
1371 
1372   /// The total size of the nodes in the graph.
1373   uint64_t TotalSize{0};
1374 };
1375 
1376 } // end of anonymous namespace
1377 
1378 std::vector<uint64_t>
1379 llvm::applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
1380                         const std::vector<uint64_t> &NodeCounts,
1381                         const std::vector<EdgeCountT> &EdgeCounts) {
1382   // Verify correctness of the input data.
1383   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1384   assert(NodeSizes.size() > 2 && "Incorrect input");
1385 
1386   // Apply the reordering algorithm.
1387   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1388   std::vector<uint64_t> Result;
1389   Alg.run(Result);
1390 
1391   // Verify correctness of the output.
1392   assert(Result.front() == 0 && "Original entry point is not preserved");
1393   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
1394   return Result;
1395 }
1396 
1397 double llvm::calcExtTspScore(const std::vector<uint64_t> &Order,
1398                              const std::vector<uint64_t> &NodeSizes,
1399                              const std::vector<uint64_t> &NodeCounts,
1400                              const std::vector<EdgeCountT> &EdgeCounts) {
1401   // Estimate addresses of the blocks in memory.
1402   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
1403   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
1404     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
1405   }
1406   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1407   for (auto It : EdgeCounts) {
1408     uint64_t Pred = It.first.first;
1409     OutDegree[Pred]++;
1410   }
1411 
1412   // Increase the score for each jump.
1413   double Score = 0;
1414   for (auto It : EdgeCounts) {
1415     uint64_t Pred = It.first.first;
1416     uint64_t Succ = It.first.second;
1417     uint64_t Count = It.second;
1418     bool IsConditional = OutDegree[Pred] > 1;
1419     Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
1420                            IsConditional);
1421   }
1422   return Score;
1423 }
1424 
1425 double llvm::calcExtTspScore(const std::vector<uint64_t> &NodeSizes,
1426                              const std::vector<uint64_t> &NodeCounts,
1427                              const std::vector<EdgeCountT> &EdgeCounts) {
1428   std::vector<uint64_t> Order(NodeSizes.size());
1429   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1430     Order[Idx] = Idx;
1431   }
1432   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1433 }
1434 
1435 std::vector<uint64_t>
1436 llvm::applyCDSLayout(const CDSortConfig &Config,
1437                      const std::vector<uint64_t> &FuncSizes,
1438                      const std::vector<uint64_t> &FuncCounts,
1439                      const std::vector<EdgeCountT> &CallCounts,
1440                      const std::vector<uint64_t> &CallOffsets) {
1441   // Verify correctness of the input data.
1442   assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1443 
1444   // Apply the reordering algorithm.
1445   CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1446   std::vector<uint64_t> Result;
1447   Alg.run(Result);
1448 
1449   // Verify correctness of the output.
1450   assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1451   return Result;
1452 }
1453 
1454 std::vector<uint64_t>
1455 llvm::applyCDSLayout(const std::vector<uint64_t> &FuncSizes,
1456                      const std::vector<uint64_t> &FuncCounts,
1457                      const std::vector<EdgeCountT> &CallCounts,
1458                      const std::vector<uint64_t> &CallOffsets) {
1459   CDSortConfig Config;
1460   // Populate the config from the command-line options.
1461   if (CacheEntries.getNumOccurrences() > 0)
1462     Config.CacheEntries = CacheEntries;
1463   if (CacheSize.getNumOccurrences() > 0)
1464     Config.CacheSize = CacheSize;
1465   if (DistancePower.getNumOccurrences() > 0)
1466     Config.DistancePower = DistancePower;
1467   if (FrequencyScale.getNumOccurrences() > 0)
1468     Config.FrequencyScale = FrequencyScale;
1469   return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1470 }
1471