xref: /llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision bc59faa86308d184638f6396db79139b569f11fd)
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The file implements "cache-aware" layout algorithms of basic blocks and
10 // functions in a binary.
11 //
12 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
13 // optimizing jump locality and thus processor I-cache utilization. This is
14 // achieved via increasing the number of fall-through jumps and co-locating
15 // frequently executed nodes together. The name follows the underlying
16 // optimization problem, Extended-TSP, which is a generalization of classical
17 // (maximum) Traveling Salesmen Problem.
18 //
19 // The algorithm is a greedy heuristic that works with chains (ordered lists)
20 // of basic blocks. Initially all chains are isolated basic blocks. On every
21 // iteration, we pick a pair of chains whose merging yields the biggest increase
22 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
23 // A pair of chains giving the maximum gain is merged into a new chain. The
24 // procedure stops when there is only one chain left, or when merging does not
25 // increase ExtTSP. In the latter case, the remaining chains are sorted by
26 // density in the decreasing order.
27 //
28 // An important aspect is the way two chains are merged. Unlike earlier
29 // algorithms (e.g., based on the approach of Pettis-Hansen), two
30 // chains, X and Y, are first split into three, X1, X2, and Y. Then we
31 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
32 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
33 // This improves the quality of the final result (the search space is larger)
34 // while keeping the implementation sufficiently fast.
35 //
36 // Reference:
37 //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
38 //     IEEE Transactions on Computers, 2020
39 //     https://arxiv.org/abs/1809.04676
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #include "llvm/Transforms/Utils/CodeLayout.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 
47 #include <cmath>
48 #include <set>
49 
50 using namespace llvm;
51 #define DEBUG_TYPE "code-layout"
52 
53 namespace llvm {
54 cl::opt<bool> EnableExtTspBlockPlacement(
55     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
56     cl::desc("Enable machine block placement based on the ext-tsp model, "
57              "optimizing I-cache utilization."));
58 
59 cl::opt<bool> ApplyExtTspWithoutProfile(
60     "ext-tsp-apply-without-profile",
61     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
62     cl::init(true), cl::Hidden);
63 } // namespace llvm
64 
65 // Algorithm-specific params for Ext-TSP. The values are tuned for the best
66 // performance of large-scale front-end bound binaries.
67 static cl::opt<double> ForwardWeightCond(
68     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
69     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
70 
71 static cl::opt<double> ForwardWeightUncond(
72     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
73     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
74 
75 static cl::opt<double> BackwardWeightCond(
76     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
77     cl::desc("The weight of conditional backward jumps for ExtTSP value"));
78 
79 static cl::opt<double> BackwardWeightUncond(
80     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
81     cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
82 
83 static cl::opt<double> FallthroughWeightCond(
84     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
85     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
86 
87 static cl::opt<double> FallthroughWeightUncond(
88     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
89     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
90 
91 static cl::opt<unsigned> ForwardDistance(
92     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
93     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
94 
95 static cl::opt<unsigned> BackwardDistance(
96     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
97     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
98 
99 // The maximum size of a chain created by the algorithm. The size is bounded
100 // so that the algorithm can efficiently process extremely large instance.
101 static cl::opt<unsigned>
102     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
103                  cl::desc("The maximum size of a chain to create."));
104 
105 // The maximum size of a chain for splitting. Larger values of the threshold
106 // may yield better quality at the cost of worsen run-time.
107 static cl::opt<unsigned> ChainSplitThreshold(
108     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
109     cl::desc("The maximum size of a chain to apply splitting"));
110 
111 // The option enables splitting (large) chains along in-coming and out-going
112 // jumps. This typically results in a better quality.
113 static cl::opt<bool> EnableChainSplitAlongJumps(
114     "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
115     cl::desc("The maximum size of a chain to apply splitting"));
116 
117 // Algorithm-specific options for CDS.
118 static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
119                                       cl::desc("The size of the cache"));
120 
121 static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden,
122                                    cl::desc("The size of a line in the cache"));
123 
124 static cl::opt<double> DistancePower(
125     "cds-distance-power", cl::ReallyHidden,
126     cl::desc("The power exponent for the distance-based locality"));
127 
128 static cl::opt<double> FrequencyScale(
129     "cds-frequency-scale", cl::ReallyHidden,
130     cl::desc("The scale factor for the frequency-based locality"));
131 
132 namespace {
133 
134 // Epsilon for comparison of doubles.
135 constexpr double EPS = 1e-8;
136 
137 // Compute the Ext-TSP score for a given jump.
138 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
139                        double Weight) {
140   if (JumpDist > JumpMaxDist)
141     return 0;
142   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
143   return Weight * Prob * Count;
144 }
145 
146 // Compute the Ext-TSP score for a jump between a given pair of blocks,
147 // using their sizes, (estimated) addresses and the jump execution count.
148 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
149                    uint64_t Count, bool IsConditional) {
150   // Fallthrough
151   if (SrcAddr + SrcSize == DstAddr) {
152     return jumpExtTSPScore(0, 1, Count,
153                            IsConditional ? FallthroughWeightCond
154                                          : FallthroughWeightUncond);
155   }
156   // Forward
157   if (SrcAddr + SrcSize < DstAddr) {
158     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
159     return jumpExtTSPScore(Dist, ForwardDistance, Count,
160                            IsConditional ? ForwardWeightCond
161                                          : ForwardWeightUncond);
162   }
163   // Backward
164   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
165   return jumpExtTSPScore(Dist, BackwardDistance, Count,
166                          IsConditional ? BackwardWeightCond
167                                        : BackwardWeightUncond);
168 }
169 
170 /// A type of merging two chains, X and Y. The former chain is split into
171 /// X1 and X2 and then concatenated with Y in the order specified by the type.
172 enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
173 
174 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
175 /// together with the corresponding merge 'type' and 'offset'.
176 struct MergeGainT {
177   explicit MergeGainT() = default;
178   explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
179       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
180 
181   double score() const { return Score; }
182 
183   size_t mergeOffset() const { return MergeOffset; }
184 
185   MergeTypeT mergeType() const { return MergeType; }
186 
187   void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
188 
189   // Returns 'true' iff Other is preferred over this.
190   bool operator<(const MergeGainT &Other) const {
191     return (Other.Score > EPS && Other.Score > Score + EPS);
192   }
193 
194   // Update the current gain if Other is preferred over this.
195   void updateIfLessThan(const MergeGainT &Other) {
196     if (*this < Other)
197       *this = Other;
198   }
199 
200 private:
201   double Score{-1.0};
202   size_t MergeOffset{0};
203   MergeTypeT MergeType{MergeTypeT::X_Y};
204 };
205 
206 struct JumpT;
207 struct ChainT;
208 struct ChainEdge;
209 
210 /// A node in the graph, typically corresponding to a basic block in the CFG or
211 /// a function in the call graph.
212 struct NodeT {
213   NodeT(const NodeT &) = delete;
214   NodeT(NodeT &&) = default;
215   NodeT &operator=(const NodeT &) = delete;
216   NodeT &operator=(NodeT &&) = default;
217 
218   explicit NodeT(size_t Index, uint64_t Size, uint64_t EC)
219       : Index(Index), Size(Size), ExecutionCount(EC) {}
220 
221   bool isEntry() const { return Index == 0; }
222 
223   // The total execution count of outgoing jumps.
224   uint64_t outCount() const;
225 
226   // The total execution count of incoming jumps.
227   uint64_t inCount() const;
228 
229   // The original index of the node in graph.
230   size_t Index{0};
231   // The index of the node in the current chain.
232   size_t CurIndex{0};
233   // The size of the node in the binary.
234   uint64_t Size{0};
235   // The execution count of the node in the profile data.
236   uint64_t ExecutionCount{0};
237   // The current chain of the node.
238   ChainT *CurChain{nullptr};
239   // The offset of the node in the current chain.
240   mutable uint64_t EstimatedAddr{0};
241   // Forced successor of the node in the graph.
242   NodeT *ForcedSucc{nullptr};
243   // Forced predecessor of the node in the graph.
244   NodeT *ForcedPred{nullptr};
245   // Outgoing jumps from the node.
246   std::vector<JumpT *> OutJumps;
247   // Incoming jumps to the node.
248   std::vector<JumpT *> InJumps;
249 };
250 
251 /// An arc in the graph, typically corresponding to a jump between two nodes.
252 struct JumpT {
253   JumpT(const JumpT &) = delete;
254   JumpT(JumpT &&) = default;
255   JumpT &operator=(const JumpT &) = delete;
256   JumpT &operator=(JumpT &&) = default;
257 
258   explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
259       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
260 
261   // Source node of the jump.
262   NodeT *Source;
263   // Target node of the jump.
264   NodeT *Target;
265   // Execution count of the arc in the profile data.
266   uint64_t ExecutionCount{0};
267   // Whether the jump corresponds to a conditional branch.
268   bool IsConditional{false};
269   // The offset of the jump from the source node.
270   uint64_t Offset{0};
271 };
272 
273 /// A chain (ordered sequence) of nodes in the graph.
274 struct ChainT {
275   ChainT(const ChainT &) = delete;
276   ChainT(ChainT &&) = default;
277   ChainT &operator=(const ChainT &) = delete;
278   ChainT &operator=(ChainT &&) = default;
279 
280   explicit ChainT(uint64_t Id, NodeT *Node)
281       : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
282         Nodes(1, Node) {}
283 
284   size_t numBlocks() const { return Nodes.size(); }
285 
286   double density() const { return static_cast<double>(ExecutionCount) / Size; }
287 
288   bool isEntry() const { return Nodes[0]->Index == 0; }
289 
290   bool isCold() const {
291     for (NodeT *Node : Nodes) {
292       if (Node->ExecutionCount > 0)
293         return false;
294     }
295     return true;
296   }
297 
298   ChainEdge *getEdge(ChainT *Other) const {
299     for (const auto &[Chain, ChainEdge] : Edges) {
300       if (Chain == Other)
301         return ChainEdge;
302     }
303     return nullptr;
304   }
305 
306   void removeEdge(ChainT *Other) {
307     auto It = Edges.begin();
308     while (It != Edges.end()) {
309       if (It->first == Other) {
310         Edges.erase(It);
311         return;
312       }
313       It++;
314     }
315   }
316 
317   void addEdge(ChainT *Other, ChainEdge *Edge) {
318     Edges.push_back(std::make_pair(Other, Edge));
319   }
320 
321   void merge(ChainT *Other, const std::vector<NodeT *> &MergedBlocks) {
322     Nodes = MergedBlocks;
323     // Update the chain's data.
324     ExecutionCount += Other->ExecutionCount;
325     Size += Other->Size;
326     Id = Nodes[0]->Index;
327     // Update the node's data.
328     for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
329       Nodes[Idx]->CurChain = this;
330       Nodes[Idx]->CurIndex = Idx;
331     }
332   }
333 
334   void mergeEdges(ChainT *Other);
335 
336   void clear() {
337     Nodes.clear();
338     Nodes.shrink_to_fit();
339     Edges.clear();
340     Edges.shrink_to_fit();
341   }
342 
343   // Unique chain identifier.
344   uint64_t Id;
345   // Cached ext-tsp score for the chain.
346   double Score{0};
347   // The total execution count of the chain.
348   uint64_t ExecutionCount{0};
349   // The total size of the chain.
350   uint64_t Size{0};
351   // Nodes of the chain.
352   std::vector<NodeT *> Nodes;
353   // Adjacent chains and corresponding edges (lists of jumps).
354   std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
355 };
356 
357 /// An edge in the graph representing jumps between two chains.
358 /// When nodes are merged into chains, the edges are combined too so that
359 /// there is always at most one edge between a pair of chains.
360 struct ChainEdge {
361   ChainEdge(const ChainEdge &) = delete;
362   ChainEdge(ChainEdge &&) = default;
363   ChainEdge &operator=(const ChainEdge &) = delete;
364   ChainEdge &operator=(ChainEdge &&) = delete;
365 
366   explicit ChainEdge(JumpT *Jump)
367       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
368         Jumps(1, Jump) {}
369 
370   ChainT *srcChain() const { return SrcChain; }
371 
372   ChainT *dstChain() const { return DstChain; }
373 
374   bool isSelfEdge() const { return SrcChain == DstChain; }
375 
376   const std::vector<JumpT *> &jumps() const { return Jumps; }
377 
378   void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
379 
380   void moveJumps(ChainEdge *Other) {
381     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
382     Other->Jumps.clear();
383     Other->Jumps.shrink_to_fit();
384   }
385 
386   void changeEndpoint(ChainT *From, ChainT *To) {
387     if (From == SrcChain)
388       SrcChain = To;
389     if (From == DstChain)
390       DstChain = To;
391   }
392 
393   bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
394     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
395   }
396 
397   MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
398     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
399   }
400 
401   void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
402     if (Src == SrcChain) {
403       CachedGainForward = MergeGain;
404       CacheValidForward = true;
405     } else {
406       CachedGainBackward = MergeGain;
407       CacheValidBackward = true;
408     }
409   }
410 
411   void invalidateCache() {
412     CacheValidForward = false;
413     CacheValidBackward = false;
414   }
415 
416   void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
417 
418   MergeGainT getMergeGain() const { return CachedGain; }
419 
420   double gain() const { return CachedGain.score(); }
421 
422 private:
423   // Source chain.
424   ChainT *SrcChain{nullptr};
425   // Destination chain.
426   ChainT *DstChain{nullptr};
427   // Original jumps in the binary with corresponding execution counts.
428   std::vector<JumpT *> Jumps;
429   // Cached gain value for merging the pair of chains.
430   MergeGainT CachedGain;
431 
432   // Cached gain values for merging the pair of chains. Since the gain of
433   // merging (Src, Dst) and (Dst, Src) might be different, we store both values
434   // here and a flag indicating which of the options results in a higher gain.
435   // Cached gain values.
436   MergeGainT CachedGainForward;
437   MergeGainT CachedGainBackward;
438   // Whether the cached value must be recomputed.
439   bool CacheValidForward{false};
440   bool CacheValidBackward{false};
441 };
442 
443 uint64_t NodeT::outCount() const {
444   uint64_t Count = 0;
445   for (JumpT *Jump : OutJumps)
446     Count += Jump->ExecutionCount;
447   return Count;
448 }
449 
450 uint64_t NodeT::inCount() const {
451   uint64_t Count = 0;
452   for (JumpT *Jump : InJumps)
453     Count += Jump->ExecutionCount;
454   return Count;
455 }
456 
457 void ChainT::mergeEdges(ChainT *Other) {
458   // Update edges adjacent to chain Other.
459   for (const auto &[DstChain, DstEdge] : Other->Edges) {
460     ChainT *TargetChain = DstChain == Other ? this : DstChain;
461     ChainEdge *CurEdge = getEdge(TargetChain);
462     if (CurEdge == nullptr) {
463       DstEdge->changeEndpoint(Other, this);
464       this->addEdge(TargetChain, DstEdge);
465       if (DstChain != this && DstChain != Other)
466         DstChain->addEdge(this, DstEdge);
467     } else {
468       CurEdge->moveJumps(DstEdge);
469     }
470     // Cleanup leftover edge.
471     if (DstChain != Other)
472       DstChain->removeEdge(Other);
473   }
474 }
475 
476 using NodeIter = std::vector<NodeT *>::const_iterator;
477 
478 /// A wrapper around three chains of nodes; it is used to avoid extra
479 /// instantiation of the vectors.
480 struct MergedChain {
481   MergedChain(NodeIter Begin1, NodeIter End1, NodeIter Begin2 = NodeIter(),
482               NodeIter End2 = NodeIter(), NodeIter Begin3 = NodeIter(),
483               NodeIter End3 = NodeIter())
484       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
485         End3(End3) {}
486 
487   template <typename F> void forEach(const F &Func) const {
488     for (auto It = Begin1; It != End1; It++)
489       Func(*It);
490     for (auto It = Begin2; It != End2; It++)
491       Func(*It);
492     for (auto It = Begin3; It != End3; It++)
493       Func(*It);
494   }
495 
496   std::vector<NodeT *> getNodes() const {
497     std::vector<NodeT *> Result;
498     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
499                    std::distance(Begin3, End3));
500     Result.insert(Result.end(), Begin1, End1);
501     Result.insert(Result.end(), Begin2, End2);
502     Result.insert(Result.end(), Begin3, End3);
503     return Result;
504   }
505 
506   const NodeT *getFirstNode() const { return *Begin1; }
507 
508 private:
509   NodeIter Begin1;
510   NodeIter End1;
511   NodeIter Begin2;
512   NodeIter End2;
513   NodeIter Begin3;
514   NodeIter End3;
515 };
516 
517 /// Merge two chains of nodes respecting a given 'type' and 'offset'.
518 ///
519 /// If MergeType == 0, then the result is a concatenation of two chains.
520 /// Otherwise, the first chain is cut into two sub-chains at the offset,
521 /// and merged using all possible ways of concatenating three chains.
522 MergedChain mergeNodes(const std::vector<NodeT *> &X,
523                        const std::vector<NodeT *> &Y, size_t MergeOffset,
524                        MergeTypeT MergeType) {
525   // Split the first chain, X, into X1 and X2.
526   NodeIter BeginX1 = X.begin();
527   NodeIter EndX1 = X.begin() + MergeOffset;
528   NodeIter BeginX2 = X.begin() + MergeOffset;
529   NodeIter EndX2 = X.end();
530   NodeIter BeginY = Y.begin();
531   NodeIter EndY = Y.end();
532 
533   // Construct a new chain from the three existing ones.
534   switch (MergeType) {
535   case MergeTypeT::X_Y:
536     return MergedChain(BeginX1, EndX2, BeginY, EndY);
537   case MergeTypeT::Y_X:
538     return MergedChain(BeginY, EndY, BeginX1, EndX2);
539   case MergeTypeT::X1_Y_X2:
540     return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
541   case MergeTypeT::Y_X2_X1:
542     return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
543   case MergeTypeT::X2_X1_Y:
544     return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
545   }
546   llvm_unreachable("unexpected chain merge type");
547 }
548 
549 /// The implementation of the ExtTSP algorithm.
550 class ExtTSPImpl {
551 public:
552   ExtTSPImpl(const std::vector<uint64_t> &NodeSizes,
553              const std::vector<uint64_t> &NodeCounts,
554              const std::vector<EdgeCountT> &EdgeCounts)
555       : NumNodes(NodeSizes.size()) {
556     initialize(NodeSizes, NodeCounts, EdgeCounts);
557   }
558 
559   /// Run the algorithm and return an optimized ordering of nodes.
560   void run(std::vector<uint64_t> &Result) {
561     // Pass 1: Merge nodes with their mutually forced successors
562     mergeForcedPairs();
563 
564     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
565     mergeChainPairs();
566 
567     // Pass 3: Merge cold nodes to reduce code size
568     mergeColdChains();
569 
570     // Collect nodes from all chains
571     concatChains(Result);
572   }
573 
574 private:
575   /// Initialize the algorithm's data structures.
576   void initialize(const std::vector<uint64_t> &NodeSizes,
577                   const std::vector<uint64_t> &NodeCounts,
578                   const std::vector<EdgeCountT> &EdgeCounts) {
579     // Initialize nodes
580     AllNodes.reserve(NumNodes);
581     for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
582       uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
583       uint64_t ExecutionCount = NodeCounts[Idx];
584       // The execution count of the entry node is set to at least one.
585       if (Idx == 0 && ExecutionCount == 0)
586         ExecutionCount = 1;
587       AllNodes.emplace_back(Idx, Size, ExecutionCount);
588     }
589 
590     // Initialize jumps between nodes
591     SuccNodes.resize(NumNodes);
592     PredNodes.resize(NumNodes);
593     std::vector<uint64_t> OutDegree(NumNodes, 0);
594     AllJumps.reserve(EdgeCounts.size());
595     for (auto It : EdgeCounts) {
596       uint64_t Pred = It.first.first;
597       uint64_t Succ = It.first.second;
598       OutDegree[Pred]++;
599       // Ignore self-edges.
600       if (Pred == Succ)
601         continue;
602 
603       SuccNodes[Pred].push_back(Succ);
604       PredNodes[Succ].push_back(Pred);
605       uint64_t ExecutionCount = It.second;
606       if (ExecutionCount > 0) {
607         NodeT &PredNode = AllNodes[Pred];
608         NodeT &SuccNode = AllNodes[Succ];
609         AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount);
610         SuccNode.InJumps.push_back(&AllJumps.back());
611         PredNode.OutJumps.push_back(&AllJumps.back());
612       }
613     }
614     for (JumpT &Jump : AllJumps) {
615       assert(OutDegree[Jump.Source->Index] > 0);
616       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
617     }
618 
619     // Initialize chains.
620     AllChains.reserve(NumNodes);
621     HotChains.reserve(NumNodes);
622     for (NodeT &Node : AllNodes) {
623       AllChains.emplace_back(Node.Index, &Node);
624       Node.CurChain = &AllChains.back();
625       if (Node.ExecutionCount > 0)
626         HotChains.push_back(&AllChains.back());
627     }
628 
629     // Initialize chain edges.
630     AllEdges.reserve(AllJumps.size());
631     for (NodeT &PredNode : AllNodes) {
632       for (JumpT *Jump : PredNode.OutJumps) {
633         NodeT *SuccNode = Jump->Target;
634         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
635         // this edge is already present in the graph.
636         if (CurEdge != nullptr) {
637           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
638           CurEdge->appendJump(Jump);
639           continue;
640         }
641         // this is a new edge.
642         AllEdges.emplace_back(Jump);
643         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
644         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
645       }
646     }
647   }
648 
649   /// For a pair of nodes, A and B, node B is the forced successor of A,
650   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
651   /// to B are from A. Such nodes should be adjacent in the optimal ordering;
652   /// the method finds and merges such pairs of nodes.
653   void mergeForcedPairs() {
654     // Find fallthroughs based on edge weights.
655     for (NodeT &Node : AllNodes) {
656       if (SuccNodes[Node.Index].size() == 1 &&
657           PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
658           SuccNodes[Node.Index][0] != 0) {
659         size_t SuccIndex = SuccNodes[Node.Index][0];
660         Node.ForcedSucc = &AllNodes[SuccIndex];
661         AllNodes[SuccIndex].ForcedPred = &Node;
662       }
663     }
664 
665     // There might be 'cycles' in the forced dependencies, since profile
666     // data isn't 100% accurate. Typically this is observed in loops, when the
667     // loop edges are the hottest successors for the basic blocks of the loop.
668     // Break the cycles by choosing the node with the smallest index as the
669     // head. This helps to keep the original order of the loops, which likely
670     // have already been rotated in the optimized manner.
671     for (NodeT &Node : AllNodes) {
672       if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
673         continue;
674 
675       NodeT *SuccNode = Node.ForcedSucc;
676       while (SuccNode != nullptr && SuccNode != &Node) {
677         SuccNode = SuccNode->ForcedSucc;
678       }
679       if (SuccNode == nullptr)
680         continue;
681       // Break the cycle.
682       AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
683       Node.ForcedPred = nullptr;
684     }
685 
686     // Merge nodes with their fallthrough successors.
687     for (NodeT &Node : AllNodes) {
688       if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
689         const NodeT *CurBlock = &Node;
690         while (CurBlock->ForcedSucc != nullptr) {
691           const NodeT *NextBlock = CurBlock->ForcedSucc;
692           mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
693           CurBlock = NextBlock;
694         }
695       }
696     }
697   }
698 
699   /// Merge pairs of chains while improving the ExtTSP objective.
700   void mergeChainPairs() {
701     /// Deterministically compare pairs of chains.
702     auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
703                                 const ChainT *A2, const ChainT *B2) {
704       if (A1 != A2)
705         return A1->Id < A2->Id;
706       return B1->Id < B2->Id;
707     };
708 
709     while (HotChains.size() > 1) {
710       ChainT *BestChainPred = nullptr;
711       ChainT *BestChainSucc = nullptr;
712       MergeGainT BestGain;
713       // Iterate over all pairs of chains.
714       for (ChainT *ChainPred : HotChains) {
715         // Get candidates for merging with the current chain.
716         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
717           // Ignore loop edges.
718           if (ChainPred == ChainSucc)
719             continue;
720 
721           // Stop early if the combined chain violates the maximum allowed size.
722           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
723             continue;
724 
725           // Compute the gain of merging the two chains.
726           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
727           if (CurGain.score() <= EPS)
728             continue;
729 
730           if (BestGain < CurGain ||
731               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
732                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
733                                  BestChainSucc))) {
734             BestGain = CurGain;
735             BestChainPred = ChainPred;
736             BestChainSucc = ChainSucc;
737           }
738         }
739       }
740 
741       // Stop merging when there is no improvement.
742       if (BestGain.score() <= EPS)
743         break;
744 
745       // Merge the best pair of chains.
746       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
747                   BestGain.mergeType());
748     }
749   }
750 
751   /// Merge remaining nodes into chains w/o taking jump counts into
752   /// consideration. This allows to maintain the original node order in the
753   /// absence of profile data.
754   void mergeColdChains() {
755     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
756       // Iterating in reverse order to make sure original fallthrough jumps are
757       // merged first; this might be beneficial for code size.
758       size_t NumSuccs = SuccNodes[SrcBB].size();
759       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
760         size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
761         ChainT *SrcChain = AllNodes[SrcBB].CurChain;
762         ChainT *DstChain = AllNodes[DstBB].CurChain;
763         if (SrcChain != DstChain && !DstChain->isEntry() &&
764             SrcChain->Nodes.back()->Index == SrcBB &&
765             DstChain->Nodes.front()->Index == DstBB &&
766             SrcChain->isCold() == DstChain->isCold()) {
767           mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
768         }
769       }
770     }
771   }
772 
773   /// Compute the Ext-TSP score for a given node order and a list of jumps.
774   double extTSPScore(const MergedChain &MergedBlocks,
775                      const std::vector<JumpT *> &Jumps) const {
776     if (Jumps.empty())
777       return 0.0;
778     uint64_t CurAddr = 0;
779     MergedBlocks.forEach([&](const NodeT *Node) {
780       Node->EstimatedAddr = CurAddr;
781       CurAddr += Node->Size;
782     });
783 
784     double Score = 0;
785     for (JumpT *Jump : Jumps) {
786       const NodeT *SrcBlock = Jump->Source;
787       const NodeT *DstBlock = Jump->Target;
788       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
789                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
790                              Jump->IsConditional);
791     }
792     return Score;
793   }
794 
795   /// Compute the gain of merging two chains.
796   ///
797   /// The function considers all possible ways of merging two chains and
798   /// computes the one having the largest increase in ExtTSP objective. The
799   /// result is a pair with the first element being the gain and the second
800   /// element being the corresponding merging type.
801   MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
802                               ChainEdge *Edge) const {
803     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
804       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
805     }
806 
807     // Precompute jumps between ChainPred and ChainSucc.
808     auto Jumps = Edge->jumps();
809     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
810     if (EdgePP != nullptr) {
811       Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
812     }
813     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
814 
815     // This object holds the best chosen gain of merging two chains.
816     MergeGainT Gain = MergeGainT();
817 
818     /// Given a merge offset and a list of merge types, try to merge two chains
819     /// and update Gain with a better alternative.
820     auto tryChainMerging = [&](size_t Offset,
821                                const std::vector<MergeTypeT> &MergeTypes) {
822       // Skip merging corresponding to concatenation w/o splitting.
823       if (Offset == 0 || Offset == ChainPred->Nodes.size())
824         return;
825       // Skip merging if it breaks Forced successors.
826       NodeT *Node = ChainPred->Nodes[Offset - 1];
827       if (Node->ForcedSucc != nullptr)
828         return;
829       // Apply the merge, compute the corresponding gain, and update the best
830       // value, if the merge is beneficial.
831       for (const MergeTypeT &MergeType : MergeTypes) {
832         Gain.updateIfLessThan(
833             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
834       }
835     };
836 
837     // Try to concatenate two chains w/o splitting.
838     Gain.updateIfLessThan(
839         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
840 
841     if (EnableChainSplitAlongJumps) {
842       // Attach (a part of) ChainPred before the first node of ChainSucc.
843       for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
844         const NodeT *SrcBlock = Jump->Source;
845         if (SrcBlock->CurChain != ChainPred)
846           continue;
847         size_t Offset = SrcBlock->CurIndex + 1;
848         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
849       }
850 
851       // Attach (a part of) ChainPred after the last node of ChainSucc.
852       for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
853         const NodeT *DstBlock = Jump->Source;
854         if (DstBlock->CurChain != ChainPred)
855           continue;
856         size_t Offset = DstBlock->CurIndex;
857         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
858       }
859     }
860 
861     // Try to break ChainPred in various ways and concatenate with ChainSucc.
862     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
863       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
864         // Try to split the chain in different ways. In practice, applying
865         // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
866         // it from consideration to reduce the search space.
867         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
868                                  MergeTypeT::X2_X1_Y});
869       }
870     }
871     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
872     return Gain;
873   }
874 
875   /// Compute the score gain of merging two chains, respecting a given
876   /// merge 'type' and 'offset'.
877   ///
878   /// The two chains are not modified in the method.
879   MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
880                               const std::vector<JumpT *> &Jumps,
881                               size_t MergeOffset, MergeTypeT MergeType) const {
882     auto MergedBlocks =
883         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
884 
885     // Do not allow a merge that does not preserve the original entry point.
886     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
887         !MergedBlocks.getFirstNode()->isEntry())
888       return MergeGainT();
889 
890     // The gain for the new chain.
891     auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score;
892     return MergeGainT(NewGainScore, MergeOffset, MergeType);
893   }
894 
895   /// Merge chain From into chain Into, update the list of active chains,
896   /// adjacency information, and the corresponding cached values.
897   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
898                    MergeTypeT MergeType) {
899     assert(Into != From && "a chain cannot be merged with itself");
900 
901     // Merge the nodes.
902     MergedChain MergedNodes =
903         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
904     Into->merge(From, MergedNodes.getNodes());
905 
906     // Merge the edges.
907     Into->mergeEdges(From);
908     From->clear();
909 
910     // Update cached ext-tsp score for the new chain.
911     ChainEdge *SelfEdge = Into->getEdge(Into);
912     if (SelfEdge != nullptr) {
913       MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end());
914       Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps());
915     }
916 
917     // Remove the chain from the list of active chains.
918     llvm::erase_value(HotChains, From);
919 
920     // Invalidate caches.
921     for (auto EdgeIt : Into->Edges)
922       EdgeIt.second->invalidateCache();
923   }
924 
925   /// Concatenate all chains into the final order.
926   void concatChains(std::vector<uint64_t> &Order) {
927     // Collect chains and calculate density stats for their sorting.
928     std::vector<const ChainT *> SortedChains;
929     DenseMap<const ChainT *, double> ChainDensity;
930     for (ChainT &Chain : AllChains) {
931       if (!Chain.Nodes.empty()) {
932         SortedChains.push_back(&Chain);
933         // Using doubles to avoid overflow of ExecutionCounts.
934         double Size = 0;
935         double ExecutionCount = 0;
936         for (NodeT *Node : Chain.Nodes) {
937           Size += static_cast<double>(Node->Size);
938           ExecutionCount += static_cast<double>(Node->ExecutionCount);
939         }
940         assert(Size > 0 && "a chain of zero size");
941         ChainDensity[&Chain] = ExecutionCount / Size;
942       }
943     }
944 
945     // Sorting chains by density in the decreasing order.
946     std::sort(SortedChains.begin(), SortedChains.end(),
947               [&](const ChainT *L, const ChainT *R) {
948                 // Place the entry point is at the beginning of the order.
949                 if (L->isEntry() != R->isEntry())
950                   return L->isEntry();
951 
952                 const double DL = ChainDensity[L];
953                 const double DR = ChainDensity[R];
954                 // Compare by density and break ties by chain identifiers.
955                 return (DL != DR) ? (DL > DR) : (L->Id < R->Id);
956                 return std::make_tuple(-DL, L->Id) <
957                        std::make_tuple(-DR, R->Id);
958               });
959 
960     // Collect the nodes in the order specified by their chains.
961     Order.reserve(NumNodes);
962     for (const ChainT *Chain : SortedChains) {
963       for (NodeT *Node : Chain->Nodes) {
964         Order.push_back(Node->Index);
965       }
966     }
967   }
968 
969 private:
970   /// The number of nodes in the graph.
971   const size_t NumNodes;
972 
973   /// Successors of each node.
974   std::vector<std::vector<uint64_t>> SuccNodes;
975 
976   /// Predecessors of each node.
977   std::vector<std::vector<uint64_t>> PredNodes;
978 
979   /// All nodes (basic blocks) in the graph.
980   std::vector<NodeT> AllNodes;
981 
982   /// All jumps between the nodes.
983   std::vector<JumpT> AllJumps;
984 
985   /// All chains of nodes.
986   std::vector<ChainT> AllChains;
987 
988   /// All edges between the chains.
989   std::vector<ChainEdge> AllEdges;
990 
991   /// Active chains. The vector gets updated at runtime when chains are merged.
992   std::vector<ChainT *> HotChains;
993 };
994 
995 /// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
996 /// functions represented by a call graph.
997 class CDSortImpl {
998 public:
999   CDSortImpl(const CDSortConfig &Config, const std::vector<uint64_t> &NodeSizes,
1000              const std::vector<uint64_t> &NodeCounts,
1001              const std::vector<EdgeCountT> &EdgeCounts,
1002              const std::vector<uint64_t> &EdgeOffsets)
1003       : Config(Config), NumNodes(NodeSizes.size()) {
1004     initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1005   }
1006 
1007   /// Run the algorithm and return an ordered set of function clusters.
1008   void run(std::vector<uint64_t> &Result) {
1009     // Merge pairs of chains while improving the objective.
1010     mergeChainPairs();
1011 
1012     LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1013                       << " of chains from " << NumNodes << " to "
1014                       << HotChains.size() << "\n");
1015 
1016     // Collect nodes from all the chains.
1017     concatChains(Result);
1018   }
1019 
1020 private:
1021   /// Initialize the algorithm's data structures.
1022   void initialize(const std::vector<uint64_t> &NodeSizes,
1023                   const std::vector<uint64_t> &NodeCounts,
1024                   const std::vector<EdgeCountT> &EdgeCounts,
1025                   const std::vector<uint64_t> &EdgeOffsets) {
1026     // Initialize nodes.
1027     AllNodes.reserve(NumNodes);
1028     for (uint64_t Node = 0; Node < NumNodes; Node++) {
1029       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1030       uint64_t ExecutionCount = NodeCounts[Node];
1031       AllNodes.emplace_back(Node, Size, ExecutionCount);
1032       TotalSamples += ExecutionCount;
1033       if (ExecutionCount > 0)
1034         TotalSize += Size;
1035     }
1036 
1037     // Initialize jumps between the nodes.
1038     SuccNodes.resize(NumNodes);
1039     PredNodes.resize(NumNodes);
1040     AllJumps.reserve(EdgeCounts.size());
1041     for (size_t I = 0; I < EdgeCounts.size(); I++) {
1042       auto It = EdgeCounts[I];
1043       uint64_t Pred = It.first.first;
1044       uint64_t Succ = It.first.second;
1045       // Ignore recursive calls.
1046       if (Pred == Succ)
1047         continue;
1048 
1049       SuccNodes[Pred].push_back(Succ);
1050       PredNodes[Succ].push_back(Pred);
1051       uint64_t ExecutionCount = It.second;
1052       if (ExecutionCount > 0) {
1053         NodeT &PredNode = AllNodes[Pred];
1054         NodeT &SuccNode = AllNodes[Succ];
1055         AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount);
1056         AllJumps.back().Offset = EdgeOffsets[I];
1057         SuccNode.InJumps.push_back(&AllJumps.back());
1058         PredNode.OutJumps.push_back(&AllJumps.back());
1059       }
1060     }
1061 
1062     // Initialize chains.
1063     AllChains.reserve(NumNodes);
1064     HotChains.reserve(NumNodes);
1065     for (NodeT &Node : AllNodes) {
1066       // Adjust execution counts.
1067       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1068       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1069       // Create chain.
1070       AllChains.emplace_back(Node.Index, &Node);
1071       Node.CurChain = &AllChains.back();
1072       if (Node.ExecutionCount > 0)
1073         HotChains.push_back(&AllChains.back());
1074     }
1075 
1076     // Initialize chain edges.
1077     AllEdges.reserve(AllJumps.size());
1078     for (NodeT &PredNode : AllNodes) {
1079       for (JumpT *Jump : PredNode.OutJumps) {
1080         NodeT *SuccNode = Jump->Target;
1081         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1082         // this edge is already present in the graph.
1083         if (CurEdge != nullptr) {
1084           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1085           CurEdge->appendJump(Jump);
1086           continue;
1087         }
1088         // this is a new edge.
1089         AllEdges.emplace_back(Jump);
1090         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1091         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1092       }
1093     }
1094   }
1095 
1096   /// Merge pairs of chains while there is an improvement in the objective.
1097   void mergeChainPairs() {
1098     // Create a priority queue containing all edges ordered by the merge gain.
1099     auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1100       return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1101              std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1102     };
1103     std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1104 
1105     // Insert the edges into the queue.
1106     for (ChainT *ChainPred : HotChains) {
1107       for (const auto &[Chain, Edge] : ChainPred->Edges) {
1108         // Ignore self-edges.
1109         if (Edge->isSelfEdge())
1110           continue;
1111         // Ignore already processed edges.
1112         if (Edge->gain() != -1.0)
1113           continue;
1114 
1115         // Compute the gain of merging the two chains.
1116         MergeGainT Gain = getBestMergeGain(Edge);
1117         Edge->setMergeGain(Gain);
1118 
1119         if (Edge->gain() > EPS)
1120           Queue.insert(Edge);
1121       }
1122     }
1123 
1124     // Merge the chains while the gain of merging is positive.
1125     while (!Queue.empty()) {
1126       // Extract the best (top) edge for merging.
1127       ChainEdge *BestEdge = *Queue.begin();
1128       Queue.erase(Queue.begin());
1129       // Ignore self-edges.
1130       if (BestEdge->isSelfEdge())
1131         continue;
1132       // Ignore edges with non-positive gains.
1133       if (BestEdge->gain() <= EPS)
1134         continue;
1135 
1136       ChainT *BestSrcChain = BestEdge->srcChain();
1137       ChainT *BestDstChain = BestEdge->dstChain();
1138 
1139       // Remove outdated edges from the queue.
1140       for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges)
1141         Queue.erase(ChainEdge);
1142       for (const auto &[Chain, ChainEdge] : BestDstChain->Edges)
1143         Queue.erase(ChainEdge);
1144 
1145       // Merge the best pair of chains.
1146       MergeGainT BestGain = BestEdge->getMergeGain();
1147       mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1148                   BestGain.mergeType());
1149 
1150       // Insert newly created edges into the queue.
1151       for (const auto &[Chain, Edge] : BestSrcChain->Edges) {
1152         // Ignore loop edges.
1153         if (Edge->isSelfEdge())
1154           continue;
1155 
1156         // Compute the gain of merging the two chains.
1157         MergeGainT Gain = getBestMergeGain(Edge);
1158         Edge->setMergeGain(Gain);
1159 
1160         if (Edge->gain() > EPS)
1161           Queue.insert(Edge);
1162       }
1163     }
1164   }
1165 
1166   /// Compute the gain of merging two chains.
1167   ///
1168   /// The function considers all possible ways of merging two chains and
1169   /// computes the one having the largest increase in ExtTSP objective. The
1170   /// result is a pair with the first element being the gain and the second
1171   /// element being the corresponding merging type.
1172   MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1173     // Precompute jumps between ChainPred and ChainSucc.
1174     auto Jumps = Edge->jumps();
1175     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
1176     ChainT *SrcChain = Edge->srcChain();
1177     ChainT *DstChain = Edge->dstChain();
1178 
1179     // This object holds the best currently chosen gain of merging two chains.
1180     MergeGainT Gain = MergeGainT();
1181 
1182     /// Given a list of merge types, try to merge two chains and update Gain
1183     /// with a better alternative.
1184     auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1185       // Apply the merge, compute the corresponding gain, and update the best
1186       // value, if the merge is beneficial.
1187       for (const MergeTypeT &MergeType : MergeTypes) {
1188         MergeGainT NewGain =
1189             computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1190 
1191         // When forward and backward gains are the same, prioritize merging that
1192         // preserves the original order of the functions in the binary.
1193         if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1194           if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1195               (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1196             Gain = NewGain;
1197           }
1198         } else if (NewGain.score() > Gain.score() + EPS) {
1199           Gain = NewGain;
1200         }
1201       }
1202     };
1203 
1204     // Try to concatenate two chains w/o splitting.
1205     tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1206 
1207     return Gain;
1208   }
1209 
1210   /// Compute the score gain of merging two chains, respecting a given type.
1211   ///
1212   /// The two chains are not modified in the method.
1213   MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1214                               const std::vector<JumpT *> &Jumps,
1215                               MergeTypeT MergeType) const {
1216     // This doesn't depend on the ordering of the nodes
1217     double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1218 
1219     // Merge offset is always 0, as the chains are not split.
1220     size_t MergeOffset = 0;
1221     auto MergedBlocks =
1222         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1223     double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1224 
1225     double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1226     // Scale the result to increase the importance of merging short chains.
1227     if (GainScore >= 0.0)
1228       GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1229 
1230     return MergeGainT(GainScore, MergeOffset, MergeType);
1231   }
1232 
1233   /// Compute the change of the frequency locality after merging the chains.
1234   double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1235     auto missProbability = [&](double ChainDensity) {
1236       double PageSamples = ChainDensity * Config.CacheSize;
1237       if (PageSamples >= TotalSamples)
1238         return 0.0;
1239       double P = PageSamples / TotalSamples;
1240       return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1241     };
1242 
1243     // Cache misses on the chains before merging.
1244     double CurScore =
1245         ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1246         ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1247 
1248     // Cache misses on the merged chain
1249     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1250     double MergedSize = ChainPred->Size + ChainSucc->Size;
1251     double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1252     double NewScore = MergedCounts * missProbability(MergedDensity);
1253 
1254     return CurScore - NewScore;
1255   }
1256 
1257   /// Compute the distance locality for a jump / call.
1258   double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1259     uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1260     double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1261     return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1262   }
1263 
1264   /// Compute the change of the distance locality after merging the chains.
1265   double distBasedLocalityGain(const MergedChain &MergedBlocks,
1266                                const std::vector<JumpT *> &Jumps) const {
1267     if (Jumps.empty())
1268       return 0.0;
1269     uint64_t CurAddr = 0;
1270     MergedBlocks.forEach([&](const NodeT *Node) {
1271       Node->EstimatedAddr = CurAddr;
1272       CurAddr += Node->Size;
1273     });
1274 
1275     double CurScore = 0;
1276     double NewScore = 0;
1277     for (const JumpT *Arc : Jumps) {
1278       uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset;
1279       uint64_t DstAddr = Arc->Target->EstimatedAddr;
1280       NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount);
1281       CurScore += distScore(0, TotalSize, Arc->ExecutionCount);
1282     }
1283     return NewScore - CurScore;
1284   }
1285 
1286   /// Merge chain From into chain Into, update the list of active chains,
1287   /// adjacency information, and the corresponding cached values.
1288   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1289                    MergeTypeT MergeType) {
1290     assert(Into != From && "a chain cannot be merged with itself");
1291 
1292     // Merge the nodes.
1293     MergedChain MergedNodes =
1294         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1295     Into->merge(From, MergedNodes.getNodes());
1296 
1297     // Merge the edges.
1298     Into->mergeEdges(From);
1299     From->clear();
1300 
1301     // Remove the chain from the list of active chains.
1302     llvm::erase_value(HotChains, From);
1303   }
1304 
1305   /// Concatenate all chains into the final order.
1306   void concatChains(std::vector<uint64_t> &Order) {
1307     // Collect chains and calculate density stats for their sorting.
1308     std::vector<const ChainT *> SortedChains;
1309     DenseMap<const ChainT *, double> ChainDensity;
1310     for (ChainT &Chain : AllChains) {
1311       if (!Chain.Nodes.empty()) {
1312         SortedChains.push_back(&Chain);
1313         // Using doubles to avoid overflow of ExecutionCounts.
1314         double Size = 0;
1315         double ExecutionCount = 0;
1316         for (NodeT *Node : Chain.Nodes) {
1317           Size += static_cast<double>(Node->Size);
1318           ExecutionCount += static_cast<double>(Node->ExecutionCount);
1319         }
1320         assert(Size > 0 && "a chain of zero size");
1321         ChainDensity[&Chain] = ExecutionCount / Size;
1322       }
1323     }
1324 
1325     // Sort chains by density in the decreasing order.
1326     std::sort(SortedChains.begin(), SortedChains.end(),
1327               [&](const ChainT *L, const ChainT *R) {
1328                 const double DL = ChainDensity[L];
1329                 const double DR = ChainDensity[R];
1330                 // Compare by density and break ties by chain identifiers.
1331                 return std::make_tuple(-DL, L->Id) <
1332                        std::make_tuple(-DR, R->Id);
1333               });
1334 
1335     // Collect the nodes in the order specified by their chains.
1336     Order.reserve(NumNodes);
1337     for (const ChainT *Chain : SortedChains)
1338       for (NodeT *Node : Chain->Nodes)
1339         Order.push_back(Node->Index);
1340   }
1341 
1342 private:
1343   /// Config for the algorithm.
1344   const CDSortConfig Config;
1345 
1346   /// The number of nodes in the graph.
1347   const size_t NumNodes;
1348 
1349   /// Successors of each node.
1350   std::vector<std::vector<uint64_t>> SuccNodes;
1351 
1352   /// Predecessors of each node.
1353   std::vector<std::vector<uint64_t>> PredNodes;
1354 
1355   /// All nodes (functions) in the graph.
1356   std::vector<NodeT> AllNodes;
1357 
1358   /// All jumps (function calls) between the nodes.
1359   std::vector<JumpT> AllJumps;
1360 
1361   /// All chains of nodes.
1362   std::vector<ChainT> AllChains;
1363 
1364   /// All edges between the chains.
1365   std::vector<ChainEdge> AllEdges;
1366 
1367   /// Active chains. The vector gets updated at runtime when chains are merged.
1368   std::vector<ChainT *> HotChains;
1369 
1370   /// The total number of samples in the graph.
1371   uint64_t TotalSamples{0};
1372 
1373   /// The total size of the nodes in the graph.
1374   uint64_t TotalSize{0};
1375 };
1376 
1377 } // end of anonymous namespace
1378 
1379 std::vector<uint64_t>
1380 llvm::applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
1381                         const std::vector<uint64_t> &NodeCounts,
1382                         const std::vector<EdgeCountT> &EdgeCounts) {
1383   // Verify correctness of the input data.
1384   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1385   assert(NodeSizes.size() > 2 && "Incorrect input");
1386 
1387   // Apply the reordering algorithm.
1388   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1389   std::vector<uint64_t> Result;
1390   Alg.run(Result);
1391 
1392   // Verify correctness of the output.
1393   assert(Result.front() == 0 && "Original entry point is not preserved");
1394   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
1395   return Result;
1396 }
1397 
1398 double llvm::calcExtTspScore(const std::vector<uint64_t> &Order,
1399                              const std::vector<uint64_t> &NodeSizes,
1400                              const std::vector<uint64_t> &NodeCounts,
1401                              const std::vector<EdgeCountT> &EdgeCounts) {
1402   // Estimate addresses of the blocks in memory.
1403   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
1404   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
1405     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
1406   }
1407   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1408   for (auto It : EdgeCounts) {
1409     uint64_t Pred = It.first.first;
1410     OutDegree[Pred]++;
1411   }
1412 
1413   // Increase the score for each jump.
1414   double Score = 0;
1415   for (auto It : EdgeCounts) {
1416     uint64_t Pred = It.first.first;
1417     uint64_t Succ = It.first.second;
1418     uint64_t Count = It.second;
1419     bool IsConditional = OutDegree[Pred] > 1;
1420     Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
1421                            IsConditional);
1422   }
1423   return Score;
1424 }
1425 
1426 double llvm::calcExtTspScore(const std::vector<uint64_t> &NodeSizes,
1427                              const std::vector<uint64_t> &NodeCounts,
1428                              const std::vector<EdgeCountT> &EdgeCounts) {
1429   std::vector<uint64_t> Order(NodeSizes.size());
1430   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1431     Order[Idx] = Idx;
1432   }
1433   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1434 }
1435 
1436 std::vector<uint64_t>
1437 llvm::applyCDSLayout(const CDSortConfig &Config,
1438                      const std::vector<uint64_t> &FuncSizes,
1439                      const std::vector<uint64_t> &FuncCounts,
1440                      const std::vector<EdgeCountT> &CallCounts,
1441                      const std::vector<uint64_t> &CallOffsets) {
1442   // Verify correctness of the input data.
1443   assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1444 
1445   // Apply the reordering algorithm.
1446   CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1447   std::vector<uint64_t> Result;
1448   Alg.run(Result);
1449 
1450   // Verify correctness of the output.
1451   assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1452   return Result;
1453 }
1454 
1455 std::vector<uint64_t>
1456 llvm::applyCDSLayout(const std::vector<uint64_t> &FuncSizes,
1457                      const std::vector<uint64_t> &FuncCounts,
1458                      const std::vector<EdgeCountT> &CallCounts,
1459                      const std::vector<uint64_t> &CallOffsets) {
1460   CDSortConfig Config;
1461   // Populate the config from the command-line options.
1462   if (CacheEntries.getNumOccurrences() > 0)
1463     Config.CacheEntries = CacheEntries;
1464   if (CacheSize.getNumOccurrences() > 0)
1465     Config.CacheSize = CacheSize;
1466   if (DistancePower.getNumOccurrences() > 0)
1467     Config.DistancePower = DistancePower;
1468   if (FrequencyScale.getNumOccurrences() > 0)
1469     Config.FrequencyScale = FrequencyScale;
1470   return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1471 }
1472