xref: /llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision 0a7bf3aad692c5bb591cac605a19980b00325d50)
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The file implements "cache-aware" layout algorithms of basic blocks and
10 // functions in a binary.
11 //
12 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
13 // optimizing jump locality and thus processor I-cache utilization. This is
14 // achieved via increasing the number of fall-through jumps and co-locating
15 // frequently executed nodes together. The name follows the underlying
16 // optimization problem, Extended-TSP, which is a generalization of classical
17 // (maximum) Traveling Salesmen Problem.
18 //
19 // The algorithm is a greedy heuristic that works with chains (ordered lists)
20 // of basic blocks. Initially all chains are isolated basic blocks. On every
21 // iteration, we pick a pair of chains whose merging yields the biggest increase
22 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
23 // A pair of chains giving the maximum gain is merged into a new chain. The
24 // procedure stops when there is only one chain left, or when merging does not
25 // increase ExtTSP. In the latter case, the remaining chains are sorted by
26 // density in the decreasing order.
27 //
28 // An important aspect is the way two chains are merged. Unlike earlier
29 // algorithms (e.g., based on the approach of Pettis-Hansen), two
30 // chains, X and Y, are first split into three, X1, X2, and Y. Then we
31 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
32 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
33 // This improves the quality of the final result (the search space is larger)
34 // while keeping the implementation sufficiently fast.
35 //
36 // Reference:
37 //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
38 //     IEEE Transactions on Computers, 2020
39 //     https://arxiv.org/abs/1809.04676
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #include "llvm/Transforms/Utils/CodeLayout.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 
47 #include <cmath>
48 #include <set>
49 
50 using namespace llvm;
51 using namespace llvm::codelayout;
52 
53 #define DEBUG_TYPE "code-layout"
54 
55 namespace llvm {
56 cl::opt<bool> EnableExtTspBlockPlacement(
57     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
58     cl::desc("Enable machine block placement based on the ext-tsp model, "
59              "optimizing I-cache utilization."));
60 
61 cl::opt<bool> ApplyExtTspWithoutProfile(
62     "ext-tsp-apply-without-profile",
63     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
64     cl::init(true), cl::Hidden);
65 } // namespace llvm
66 
67 // Algorithm-specific params for Ext-TSP. The values are tuned for the best
68 // performance of large-scale front-end bound binaries.
69 static cl::opt<double> ForwardWeightCond(
70     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
71     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
72 
73 static cl::opt<double> ForwardWeightUncond(
74     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
75     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
76 
77 static cl::opt<double> BackwardWeightCond(
78     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
79     cl::desc("The weight of conditional backward jumps for ExtTSP value"));
80 
81 static cl::opt<double> BackwardWeightUncond(
82     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
83     cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
84 
85 static cl::opt<double> FallthroughWeightCond(
86     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
87     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
88 
89 static cl::opt<double> FallthroughWeightUncond(
90     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
91     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
92 
93 static cl::opt<unsigned> ForwardDistance(
94     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
95     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
96 
97 static cl::opt<unsigned> BackwardDistance(
98     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
99     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
100 
101 // The maximum size of a chain created by the algorithm. The size is bounded
102 // so that the algorithm can efficiently process extremely large instances.
103 static cl::opt<unsigned>
104     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
105                  cl::desc("The maximum size of a chain to create"));
106 
107 // The maximum ratio between densities of two chains for merging.
108 static cl::opt<double> MaxMergeDensityRatio(
109     "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(128),
110     cl::desc("The maximum ratio between densities of two chains for merging"));
111 
112 // Algorithm-specific options for CDS.
113 static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
114                                       cl::desc("The size of the cache"));
115 
116 static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden,
117                                    cl::desc("The size of a line in the cache"));
118 
119 static cl::opt<double> DistancePower(
120     "cds-distance-power", cl::ReallyHidden,
121     cl::desc("The power exponent for the distance-based locality"));
122 
123 static cl::opt<double> FrequencyScale(
124     "cds-frequency-scale", cl::ReallyHidden,
125     cl::desc("The scale factor for the frequency-based locality"));
126 
127 namespace {
128 
129 // Epsilon for comparison of doubles.
130 constexpr double EPS = 1e-8;
131 
132 // Compute the Ext-TSP score for a given jump.
133 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
134                        double Weight) {
135   if (JumpDist > JumpMaxDist)
136     return 0;
137   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
138   return Weight * Prob * Count;
139 }
140 
141 // Compute the Ext-TSP score for a jump between a given pair of blocks,
142 // using their sizes, (estimated) addresses and the jump execution count.
143 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
144                    uint64_t Count, bool IsConditional) {
145   // Fallthrough
146   if (SrcAddr + SrcSize == DstAddr) {
147     return jumpExtTSPScore(0, 1, Count,
148                            IsConditional ? FallthroughWeightCond
149                                          : FallthroughWeightUncond);
150   }
151   // Forward
152   if (SrcAddr + SrcSize < DstAddr) {
153     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
154     return jumpExtTSPScore(Dist, ForwardDistance, Count,
155                            IsConditional ? ForwardWeightCond
156                                          : ForwardWeightUncond);
157   }
158   // Backward
159   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
160   return jumpExtTSPScore(Dist, BackwardDistance, Count,
161                          IsConditional ? BackwardWeightCond
162                                        : BackwardWeightUncond);
163 }
164 
165 /// A type of merging two chains, X and Y. The former chain is split into
166 /// X1 and X2 and then concatenated with Y in the order specified by the type.
167 enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
168 
169 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
170 /// together with the corresponding merge 'type' and 'offset'.
171 struct MergeGainT {
172   explicit MergeGainT() = default;
173   explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
174       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
175 
176   double score() const { return Score; }
177 
178   size_t mergeOffset() const { return MergeOffset; }
179 
180   MergeTypeT mergeType() const { return MergeType; }
181 
182   void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
183 
184   // Returns 'true' iff Other is preferred over this.
185   bool operator<(const MergeGainT &Other) const {
186     return (Other.Score > EPS && Other.Score > Score + EPS);
187   }
188 
189   // Update the current gain if Other is preferred over this.
190   void updateIfLessThan(const MergeGainT &Other) {
191     if (*this < Other)
192       *this = Other;
193   }
194 
195 private:
196   double Score{-1.0};
197   size_t MergeOffset{0};
198   MergeTypeT MergeType{MergeTypeT::X_Y};
199 };
200 
201 struct JumpT;
202 struct ChainT;
203 struct ChainEdge;
204 
205 /// A node in the graph, typically corresponding to a basic block in the CFG or
206 /// a function in the call graph.
207 struct NodeT {
208   NodeT(const NodeT &) = delete;
209   NodeT(NodeT &&) = default;
210   NodeT &operator=(const NodeT &) = delete;
211   NodeT &operator=(NodeT &&) = default;
212 
213   explicit NodeT(size_t Index, uint64_t Size, uint64_t Count)
214       : Index(Index), Size(Size), ExecutionCount(Count) {}
215 
216   bool isEntry() const { return Index == 0; }
217 
218   // Check if Other is a successor of the node.
219   bool isSuccessor(const NodeT *Other) const;
220   // The total execution count of outgoing jumps.
221   uint64_t outCount() const;
222 
223   // The total execution count of incoming jumps.
224   uint64_t inCount() const;
225 
226   // The original index of the node in graph.
227   size_t Index{0};
228   // The index of the node in the current chain.
229   size_t CurIndex{0};
230   // The size of the node in the binary.
231   uint64_t Size{0};
232   // The execution count of the node in the profile data.
233   uint64_t ExecutionCount{0};
234   // The current chain of the node.
235   ChainT *CurChain{nullptr};
236   // The offset of the node in the current chain.
237   mutable uint64_t EstimatedAddr{0};
238   // Forced successor of the node in the graph.
239   NodeT *ForcedSucc{nullptr};
240   // Forced predecessor of the node in the graph.
241   NodeT *ForcedPred{nullptr};
242   // Outgoing jumps from the node.
243   std::vector<JumpT *> OutJumps;
244   // Incoming jumps to the node.
245   std::vector<JumpT *> InJumps;
246 };
247 
248 /// An arc in the graph, typically corresponding to a jump between two nodes.
249 struct JumpT {
250   JumpT(const JumpT &) = delete;
251   JumpT(JumpT &&) = default;
252   JumpT &operator=(const JumpT &) = delete;
253   JumpT &operator=(JumpT &&) = default;
254 
255   explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
256       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
257 
258   // Source node of the jump.
259   NodeT *Source;
260   // Target node of the jump.
261   NodeT *Target;
262   // Execution count of the arc in the profile data.
263   uint64_t ExecutionCount{0};
264   // Whether the jump corresponds to a conditional branch.
265   bool IsConditional{false};
266   // The offset of the jump from the source node.
267   uint64_t Offset{0};
268 };
269 
270 /// A chain (ordered sequence) of nodes in the graph.
271 struct ChainT {
272   ChainT(const ChainT &) = delete;
273   ChainT(ChainT &&) = default;
274   ChainT &operator=(const ChainT &) = delete;
275   ChainT &operator=(ChainT &&) = default;
276 
277   explicit ChainT(uint64_t Id, NodeT *Node)
278       : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
279         Nodes(1, Node) {}
280 
281   size_t numBlocks() const { return Nodes.size(); }
282 
283   double density() const { return static_cast<double>(ExecutionCount) / Size; }
284 
285   bool isEntry() const { return Nodes[0]->Index == 0; }
286 
287   bool isCold() const {
288     for (NodeT *Node : Nodes) {
289       if (Node->ExecutionCount > 0)
290         return false;
291     }
292     return true;
293   }
294 
295   ChainEdge *getEdge(ChainT *Other) const {
296     for (const auto &[Chain, ChainEdge] : Edges) {
297       if (Chain == Other)
298         return ChainEdge;
299     }
300     return nullptr;
301   }
302 
303   void removeEdge(ChainT *Other) {
304     auto It = Edges.begin();
305     while (It != Edges.end()) {
306       if (It->first == Other) {
307         Edges.erase(It);
308         return;
309       }
310       It++;
311     }
312   }
313 
314   void addEdge(ChainT *Other, ChainEdge *Edge) {
315     Edges.push_back(std::make_pair(Other, Edge));
316   }
317 
318   void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) {
319     Nodes = std::move(MergedBlocks);
320     // Update the chain's data.
321     ExecutionCount += Other->ExecutionCount;
322     Size += Other->Size;
323     Id = Nodes[0]->Index;
324     // Update the node's data.
325     for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
326       Nodes[Idx]->CurChain = this;
327       Nodes[Idx]->CurIndex = Idx;
328     }
329   }
330 
331   void mergeEdges(ChainT *Other);
332 
333   void clear() {
334     Nodes.clear();
335     Nodes.shrink_to_fit();
336     Edges.clear();
337     Edges.shrink_to_fit();
338   }
339 
340   // Unique chain identifier.
341   uint64_t Id;
342   // Cached ext-tsp score for the chain.
343   double Score{0};
344   // The total execution count of the chain.
345   uint64_t ExecutionCount{0};
346   // The total size of the chain.
347   uint64_t Size{0};
348   // Nodes of the chain.
349   std::vector<NodeT *> Nodes;
350   // Adjacent chains and corresponding edges (lists of jumps).
351   std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
352 };
353 
354 /// An edge in the graph representing jumps between two chains.
355 /// When nodes are merged into chains, the edges are combined too so that
356 /// there is always at most one edge between a pair of chains.
357 struct ChainEdge {
358   ChainEdge(const ChainEdge &) = delete;
359   ChainEdge(ChainEdge &&) = default;
360   ChainEdge &operator=(const ChainEdge &) = delete;
361   ChainEdge &operator=(ChainEdge &&) = delete;
362 
363   explicit ChainEdge(JumpT *Jump)
364       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
365         Jumps(1, Jump) {}
366 
367   ChainT *srcChain() const { return SrcChain; }
368 
369   ChainT *dstChain() const { return DstChain; }
370 
371   bool isSelfEdge() const { return SrcChain == DstChain; }
372 
373   const std::vector<JumpT *> &jumps() const { return Jumps; }
374 
375   void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
376 
377   void moveJumps(ChainEdge *Other) {
378     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
379     Other->Jumps.clear();
380     Other->Jumps.shrink_to_fit();
381   }
382 
383   void changeEndpoint(ChainT *From, ChainT *To) {
384     if (From == SrcChain)
385       SrcChain = To;
386     if (From == DstChain)
387       DstChain = To;
388   }
389 
390   bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
391     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
392   }
393 
394   MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
395     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
396   }
397 
398   void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
399     if (Src == SrcChain) {
400       CachedGainForward = MergeGain;
401       CacheValidForward = true;
402     } else {
403       CachedGainBackward = MergeGain;
404       CacheValidBackward = true;
405     }
406   }
407 
408   void invalidateCache() {
409     CacheValidForward = false;
410     CacheValidBackward = false;
411   }
412 
413   void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
414 
415   MergeGainT getMergeGain() const { return CachedGain; }
416 
417   double gain() const { return CachedGain.score(); }
418 
419 private:
420   // Source chain.
421   ChainT *SrcChain{nullptr};
422   // Destination chain.
423   ChainT *DstChain{nullptr};
424   // Original jumps in the binary with corresponding execution counts.
425   std::vector<JumpT *> Jumps;
426   // Cached gain value for merging the pair of chains.
427   MergeGainT CachedGain;
428 
429   // Cached gain values for merging the pair of chains. Since the gain of
430   // merging (Src, Dst) and (Dst, Src) might be different, we store both values
431   // here and a flag indicating which of the options results in a higher gain.
432   // Cached gain values.
433   MergeGainT CachedGainForward;
434   MergeGainT CachedGainBackward;
435   // Whether the cached value must be recomputed.
436   bool CacheValidForward{false};
437   bool CacheValidBackward{false};
438 };
439 
440 bool NodeT::isSuccessor(const NodeT *Other) const {
441   for (JumpT *Jump : OutJumps) {
442     if (Jump->Target == Other)
443       return true;
444   }
445   return false;
446 }
447 
448 uint64_t NodeT::outCount() const {
449   uint64_t Count = 0;
450   for (JumpT *Jump : OutJumps)
451     Count += Jump->ExecutionCount;
452   return Count;
453 }
454 
455 uint64_t NodeT::inCount() const {
456   uint64_t Count = 0;
457   for (JumpT *Jump : InJumps)
458     Count += Jump->ExecutionCount;
459   return Count;
460 }
461 
462 void ChainT::mergeEdges(ChainT *Other) {
463   // Update edges adjacent to chain Other.
464   for (const auto &[DstChain, DstEdge] : Other->Edges) {
465     ChainT *TargetChain = DstChain == Other ? this : DstChain;
466     ChainEdge *CurEdge = getEdge(TargetChain);
467     if (CurEdge == nullptr) {
468       DstEdge->changeEndpoint(Other, this);
469       this->addEdge(TargetChain, DstEdge);
470       if (DstChain != this && DstChain != Other)
471         DstChain->addEdge(this, DstEdge);
472     } else {
473       CurEdge->moveJumps(DstEdge);
474     }
475     // Cleanup leftover edge.
476     if (DstChain != Other)
477       DstChain->removeEdge(Other);
478   }
479 }
480 
481 /// A wrapper around three concatenated vectors (chains) of objects; it is used
482 /// to avoid extra instantiation of the vectors.
483 template <typename ObjType> struct MergedVector {
484   using ObjIter = typename std::vector<ObjType *>::const_iterator;
485 
486   MergedVector(ObjIter Begin1, ObjIter End1, ObjIter Begin2 = ObjIter(),
487                ObjIter End2 = ObjIter(), ObjIter Begin3 = ObjIter(),
488                ObjIter End3 = ObjIter())
489       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
490         End3(End3) {}
491 
492   template <typename F> void forEach(const F &Func) const {
493     for (auto It = Begin1; It != End1; It++)
494       Func(*It);
495     for (auto It = Begin2; It != End2; It++)
496       Func(*It);
497     for (auto It = Begin3; It != End3; It++)
498       Func(*It);
499   }
500 
501   std::vector<NodeT *> getNodes() const {
502     std::vector<NodeT *> Result;
503     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
504                    std::distance(Begin3, End3));
505     Result.insert(Result.end(), Begin1, End1);
506     Result.insert(Result.end(), Begin2, End2);
507     Result.insert(Result.end(), Begin3, End3);
508     return Result;
509   }
510 
511   const NodeT *getFirstNode() const { return *Begin1; }
512 
513   bool empty() const { return Begin1 == End1; }
514 
515   void append(ObjIter Begin, ObjIter End) {
516     if (Begin2 == End2) {
517       Begin2 = Begin;
518       End2 = End;
519       return;
520     }
521     assert(Begin3 == End3 && "cannot extend MergedVector");
522     Begin3 = Begin;
523     End3 = End;
524   }
525 
526 private:
527   ObjIter Begin1;
528   ObjIter End1;
529   ObjIter Begin2;
530   ObjIter End2;
531   ObjIter Begin3;
532   ObjIter End3;
533 };
534 
535 /// Merge two chains of nodes respecting a given 'type' and 'offset'.
536 ///
537 /// If MergeType == 0, then the result is a concatenation of two chains.
538 /// Otherwise, the first chain is cut into two sub-chains at the offset,
539 /// and merged using all possible ways of concatenating three chains.
540 MergedVector<NodeT> mergeNodes(const std::vector<NodeT *> &X,
541                                const std::vector<NodeT *> &Y,
542                                size_t MergeOffset, MergeTypeT MergeType) {
543   // Split the first chain, X, into X1 and X2.
544   MergedVector<NodeT>::ObjIter BeginX1 = X.begin();
545   MergedVector<NodeT>::ObjIter EndX1 = X.begin() + MergeOffset;
546   MergedVector<NodeT>::ObjIter BeginX2 = X.begin() + MergeOffset;
547   MergedVector<NodeT>::ObjIter EndX2 = X.end();
548   MergedVector<NodeT>::ObjIter BeginY = Y.begin();
549   MergedVector<NodeT>::ObjIter EndY = Y.end();
550 
551   // Construct a new chain from the three existing ones.
552   switch (MergeType) {
553   case MergeTypeT::X_Y:
554     return MergedVector<NodeT>(BeginX1, EndX2, BeginY, EndY);
555   case MergeTypeT::Y_X:
556     return MergedVector<NodeT>(BeginY, EndY, BeginX1, EndX2);
557   case MergeTypeT::X1_Y_X2:
558     return MergedVector<NodeT>(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
559   case MergeTypeT::Y_X2_X1:
560     return MergedVector<NodeT>(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
561   case MergeTypeT::X2_X1_Y:
562     return MergedVector<NodeT>(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
563   }
564   llvm_unreachable("unexpected chain merge type");
565 }
566 
567 /// The implementation of the ExtTSP algorithm.
568 class ExtTSPImpl {
569 public:
570   ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts,
571              ArrayRef<EdgeCount> EdgeCounts)
572       : NumNodes(NodeSizes.size()) {
573     initialize(NodeSizes, NodeCounts, EdgeCounts);
574   }
575 
576   /// Run the algorithm and return an optimized ordering of nodes.
577   std::vector<uint64_t> run() {
578     // Pass 1: Merge nodes with their mutually forced successors
579     mergeForcedPairs();
580 
581     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
582     mergeChainPairs();
583 
584     // Pass 3: Merge cold nodes to reduce code size
585     mergeColdChains();
586 
587     // Collect nodes from all chains
588     return concatChains();
589   }
590 
591 private:
592   /// Initialize the algorithm's data structures.
593   void initialize(const ArrayRef<uint64_t> &NodeSizes,
594                   const ArrayRef<uint64_t> &NodeCounts,
595                   const ArrayRef<EdgeCount> &EdgeCounts) {
596     // Initialize nodes
597     AllNodes.reserve(NumNodes);
598     for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
599       uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
600       uint64_t ExecutionCount = NodeCounts[Idx];
601       // The execution count of the entry node is set to at least one.
602       if (Idx == 0 && ExecutionCount == 0)
603         ExecutionCount = 1;
604       AllNodes.emplace_back(Idx, Size, ExecutionCount);
605     }
606 
607     // Initialize jumps between nodes
608     SuccNodes.resize(NumNodes);
609     PredNodes.resize(NumNodes);
610     std::vector<uint64_t> OutDegree(NumNodes, 0);
611     AllJumps.reserve(EdgeCounts.size());
612     for (auto Edge : EdgeCounts) {
613       ++OutDegree[Edge.src];
614       // Ignore self-edges.
615       if (Edge.src == Edge.dst)
616         continue;
617 
618       SuccNodes[Edge.src].push_back(Edge.dst);
619       PredNodes[Edge.dst].push_back(Edge.src);
620       if (Edge.count > 0) {
621         NodeT &PredNode = AllNodes[Edge.src];
622         NodeT &SuccNode = AllNodes[Edge.dst];
623         AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count);
624         SuccNode.InJumps.push_back(&AllJumps.back());
625         PredNode.OutJumps.push_back(&AllJumps.back());
626       }
627     }
628     for (JumpT &Jump : AllJumps) {
629       assert(OutDegree[Jump.Source->Index] > 0);
630       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
631     }
632 
633     // Initialize chains.
634     AllChains.reserve(NumNodes);
635     HotChains.reserve(NumNodes);
636     for (NodeT &Node : AllNodes) {
637       // Adjust execution counts.
638       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
639       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
640       // Create a chain.
641       AllChains.emplace_back(Node.Index, &Node);
642       Node.CurChain = &AllChains.back();
643       if (Node.ExecutionCount > 0)
644         HotChains.push_back(&AllChains.back());
645     }
646 
647     // Initialize chain edges.
648     AllEdges.reserve(AllJumps.size());
649     for (NodeT &PredNode : AllNodes) {
650       for (JumpT *Jump : PredNode.OutJumps) {
651         NodeT *SuccNode = Jump->Target;
652         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
653         // This edge is already present in the graph.
654         if (CurEdge != nullptr) {
655           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
656           CurEdge->appendJump(Jump);
657           continue;
658         }
659         // This is a new edge.
660         AllEdges.emplace_back(Jump);
661         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
662         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
663       }
664     }
665   }
666 
667   /// For a pair of nodes, A and B, node B is the forced successor of A,
668   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
669   /// to B are from A. Such nodes should be adjacent in the optimal ordering;
670   /// the method finds and merges such pairs of nodes.
671   void mergeForcedPairs() {
672     // Find forced pairs of blocks.
673     for (NodeT &Node : AllNodes) {
674       if (SuccNodes[Node.Index].size() == 1 &&
675           PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
676           SuccNodes[Node.Index][0] != 0) {
677         size_t SuccIndex = SuccNodes[Node.Index][0];
678         Node.ForcedSucc = &AllNodes[SuccIndex];
679         AllNodes[SuccIndex].ForcedPred = &Node;
680       }
681     }
682 
683     // There might be 'cycles' in the forced dependencies, since profile
684     // data isn't 100% accurate. Typically this is observed in loops, when the
685     // loop edges are the hottest successors for the basic blocks of the loop.
686     // Break the cycles by choosing the node with the smallest index as the
687     // head. This helps to keep the original order of the loops, which likely
688     // have already been rotated in the optimized manner.
689     for (NodeT &Node : AllNodes) {
690       if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
691         continue;
692 
693       NodeT *SuccNode = Node.ForcedSucc;
694       while (SuccNode != nullptr && SuccNode != &Node) {
695         SuccNode = SuccNode->ForcedSucc;
696       }
697       if (SuccNode == nullptr)
698         continue;
699       // Break the cycle.
700       AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
701       Node.ForcedPred = nullptr;
702     }
703 
704     // Merge nodes with their fallthrough successors.
705     for (NodeT &Node : AllNodes) {
706       if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
707         const NodeT *CurBlock = &Node;
708         while (CurBlock->ForcedSucc != nullptr) {
709           const NodeT *NextBlock = CurBlock->ForcedSucc;
710           mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
711           CurBlock = NextBlock;
712         }
713       }
714     }
715   }
716 
717   /// Merge pairs of chains while improving the ExtTSP objective.
718   void mergeChainPairs() {
719     /// Deterministically compare pairs of chains.
720     auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
721                                 const ChainT *A2, const ChainT *B2) {
722       return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
723     };
724 
725     double PrevScore = 1e9;
726     while (HotChains.size() > 1) {
727       ChainT *BestChainPred = nullptr;
728       ChainT *BestChainSucc = nullptr;
729       MergeGainT BestGain;
730       // Iterate over all pairs of chains.
731       for (ChainT *ChainPred : HotChains) {
732         // Since the score of merging doesn't increase, we can stop early when
733         // the newly found merge is as good as the previous one.
734         if (BestGain.score() == PrevScore)
735           break;
736         // Get candidates for merging with the current chain.
737         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
738           // Ignore loop edges.
739           if (Edge->isSelfEdge())
740             continue;
741           // Stop early if the combined chain violates the maximum allowed size.
742           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
743             continue;
744           // Don't merge the chains if they have vastly different densities.
745           // We stop early if the ratio between the densities exceeds
746           // MaxMergeDensityRatio. Smaller values of the option result in
747           // fewer merges (hence, more chains), which in turn typically yields
748           // smaller size of the hot code section.
749           double minDensity =
750               std::min(ChainPred->density(), ChainSucc->density());
751           double maxDensity =
752               std::max(ChainPred->density(), ChainSucc->density());
753           assert(minDensity > 0.0 && maxDensity > 0.0 &&
754                  "incorrectly computed chain densities");
755           const double Ratio = maxDensity / minDensity;
756           if (Ratio > MaxMergeDensityRatio)
757             continue;
758 
759           // Compute the gain of merging the two chains
760           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
761           if (CurGain.score() <= EPS)
762             continue;
763 
764           if (BestGain < CurGain ||
765               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
766                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
767                                  BestChainSucc))) {
768             BestGain = CurGain;
769             BestChainPred = ChainPred;
770             BestChainSucc = ChainSucc;
771             // Stop early when the merge is as good as the previous one.
772             if (BestGain.score() == PrevScore)
773               break;
774           }
775         }
776       }
777 
778       // Stop merging when there is no improvement.
779       if (BestGain.score() <= EPS)
780         break;
781 
782       // Merge the best pair of chains.
783       PrevScore = BestGain.score();
784       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
785                   BestGain.mergeType());
786     }
787   }
788 
789   /// Merge remaining nodes into chains w/o taking jump counts into
790   /// consideration. This allows to maintain the original node order in the
791   /// absence of profile data.
792   void mergeColdChains() {
793     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
794       // Iterating in reverse order to make sure original fallthrough jumps are
795       // merged first; this might be beneficial for code size.
796       size_t NumSuccs = SuccNodes[SrcBB].size();
797       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
798         size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
799         ChainT *SrcChain = AllNodes[SrcBB].CurChain;
800         ChainT *DstChain = AllNodes[DstBB].CurChain;
801         if (SrcChain != DstChain && !DstChain->isEntry() &&
802             SrcChain->Nodes.back()->Index == SrcBB &&
803             DstChain->Nodes.front()->Index == DstBB &&
804             SrcChain->isCold() == DstChain->isCold()) {
805           mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
806         }
807       }
808     }
809   }
810 
811   /// Compute the Ext-TSP score for a given node order and a list of jumps.
812   double extTSPScore(const MergedVector<NodeT> &Nodes,
813                      const MergedVector<JumpT> &Jumps) const {
814     if (Jumps.empty() || Nodes.empty())
815       return 0.0;
816 
817     uint64_t CurAddr = 0;
818     Nodes.forEach([&](const NodeT *Node) {
819       Node->EstimatedAddr = CurAddr;
820       CurAddr += Node->Size;
821     });
822 
823     double Score = 0;
824     Jumps.forEach([&](const JumpT *Jump) {
825       const NodeT *SrcBlock = Jump->Source;
826       const NodeT *DstBlock = Jump->Target;
827       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
828                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
829                              Jump->IsConditional);
830     });
831     return Score;
832   }
833 
834   /// Compute the gain of merging two chains.
835   ///
836   /// The function considers all possible ways of merging two chains and
837   /// computes the one having the largest increase in ExtTSP objective. The
838   /// result is a pair with the first element being the gain and the second
839   /// element being the corresponding merging type.
840   MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
841                               ChainEdge *Edge) const {
842     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc))
843       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
844 
845     // Precompute jumps between ChainPred and ChainSucc.
846     MergedVector<JumpT> Jumps(Edge->jumps().begin(), Edge->jumps().end());
847     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
848     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
849     if (EdgePP != nullptr)
850       Jumps.append(EdgePP->jumps().begin(), EdgePP->jumps().end());
851 
852     // This object holds the best chosen gain of merging two chains.
853     MergeGainT Gain = MergeGainT();
854 
855     /// Given a merge offset and a list of merge types, try to merge two chains
856     /// and update Gain with a better alternative.
857     auto tryChainMerging = [&](size_t Offset,
858                                const std::vector<MergeTypeT> &MergeTypes) {
859       // Skip merging corresponding to concatenation w/o splitting.
860       if (Offset == 0 || Offset == ChainPred->Nodes.size())
861         return;
862       // Skip merging if it breaks Forced successors.
863       NodeT *Node = ChainPred->Nodes[Offset - 1];
864       if (Node->ForcedSucc != nullptr)
865         return;
866       // Apply the merge, compute the corresponding gain, and update the best
867       // value, if the merge is beneficial.
868       for (const MergeTypeT &MergeType : MergeTypes) {
869         Gain.updateIfLessThan(
870             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
871       }
872     };
873 
874     // Try to concatenate two chains w/o splitting.
875     Gain.updateIfLessThan(
876         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
877 
878     // Attach (a part of) ChainPred before the first node of ChainSucc.
879     for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
880       const NodeT *SrcBlock = Jump->Source;
881       if (SrcBlock->CurChain != ChainPred)
882         continue;
883       size_t Offset = SrcBlock->CurIndex + 1;
884       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
885     }
886 
887     // Attach (a part of) ChainPred after the last node of ChainSucc.
888     for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
889       const NodeT *DstBlock = Jump->Target;
890       if (DstBlock->CurChain != ChainPred)
891         continue;
892       size_t Offset = DstBlock->CurIndex;
893       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
894     }
895 
896     // Try to break ChainPred in various ways and concatenate with ChainSucc.
897     // In practice, applying X2_Y_X1 merging is almost never provides benefits;
898     // thus, we exclude it from consideration to reduce the search space.
899     for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
900       // Do not split the chain along a jump.
901       const NodeT *BB = ChainPred->Nodes[Offset - 1];
902       const NodeT *BB2 = ChainPred->Nodes[Offset];
903       if (BB->isSuccessor(BB2))
904         continue;
905 
906       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
907                                MergeTypeT::X2_X1_Y});
908     }
909     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
910     return Gain;
911   }
912 
913   /// Compute the score gain of merging two chains, respecting a given
914   /// merge 'type' and 'offset'.
915   ///
916   /// The two chains are not modified in the method.
917   MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
918                               const MergedVector<JumpT> &Jumps,
919                               size_t MergeOffset, MergeTypeT MergeType) const {
920     MergedVector<NodeT> MergedNodes =
921         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
922 
923     // Do not allow a merge that does not preserve the original entry point.
924     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
925         !MergedNodes.getFirstNode()->isEntry())
926       return MergeGainT();
927 
928     // The gain for the new chain.
929     double NewScore = extTSPScore(MergedNodes, Jumps);
930     double CurScore = ChainPred->Score;
931     return MergeGainT(NewScore - CurScore, MergeOffset, MergeType);
932   }
933 
934   /// Merge chain From into chain Into, update the list of active chains,
935   /// adjacency information, and the corresponding cached values.
936   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
937                    MergeTypeT MergeType) {
938     assert(Into != From && "a chain cannot be merged with itself");
939 
940     // Merge the nodes.
941     MergedVector<NodeT> MergedNodes =
942         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
943     Into->merge(From, MergedNodes.getNodes());
944 
945     // Merge the edges.
946     Into->mergeEdges(From);
947     From->clear();
948 
949     // Update cached ext-tsp score for the new chain.
950     ChainEdge *SelfEdge = Into->getEdge(Into);
951     if (SelfEdge != nullptr) {
952       MergedNodes = MergedVector<NodeT>(Into->Nodes.begin(), Into->Nodes.end());
953       MergedVector<JumpT> MergedJumps(SelfEdge->jumps().begin(),
954                                       SelfEdge->jumps().end());
955       Into->Score = extTSPScore(MergedNodes, MergedJumps);
956     }
957 
958     // Remove the chain from the list of active chains.
959     llvm::erase_value(HotChains, From);
960 
961     // Invalidate caches.
962     for (auto EdgeIt : Into->Edges)
963       EdgeIt.second->invalidateCache();
964   }
965 
966   /// Concatenate all chains into the final order.
967   std::vector<uint64_t> concatChains() {
968     // Collect chains and calculate density stats for their sorting.
969     std::vector<const ChainT *> SortedChains;
970     DenseMap<const ChainT *, double> ChainDensity;
971     for (ChainT &Chain : AllChains) {
972       if (!Chain.Nodes.empty()) {
973         SortedChains.push_back(&Chain);
974         // Using doubles to avoid overflow of ExecutionCounts.
975         double Size = 0;
976         double ExecutionCount = 0;
977         for (NodeT *Node : Chain.Nodes) {
978           Size += static_cast<double>(Node->Size);
979           ExecutionCount += static_cast<double>(Node->ExecutionCount);
980         }
981         assert(Size > 0 && "a chain of zero size");
982         ChainDensity[&Chain] = ExecutionCount / Size;
983       }
984     }
985 
986     // Sorting chains by density in the decreasing order.
987     std::sort(SortedChains.begin(), SortedChains.end(),
988               [&](const ChainT *L, const ChainT *R) {
989                 // Place the entry point is at the beginning of the order.
990                 if (L->isEntry() != R->isEntry())
991                   return L->isEntry();
992 
993                 const double DL = ChainDensity[L];
994                 const double DR = ChainDensity[R];
995                 // Compare by density and break ties by chain identifiers.
996                 return std::make_tuple(-DL, L->Id) <
997                        std::make_tuple(-DR, R->Id);
998               });
999 
1000     // Collect the nodes in the order specified by their chains.
1001     std::vector<uint64_t> Order;
1002     Order.reserve(NumNodes);
1003     for (const ChainT *Chain : SortedChains)
1004       for (NodeT *Node : Chain->Nodes)
1005         Order.push_back(Node->Index);
1006     return Order;
1007   }
1008 
1009 private:
1010   /// The number of nodes in the graph.
1011   const size_t NumNodes;
1012 
1013   /// Successors of each node.
1014   std::vector<std::vector<uint64_t>> SuccNodes;
1015 
1016   /// Predecessors of each node.
1017   std::vector<std::vector<uint64_t>> PredNodes;
1018 
1019   /// All nodes (basic blocks) in the graph.
1020   std::vector<NodeT> AllNodes;
1021 
1022   /// All jumps between the nodes.
1023   std::vector<JumpT> AllJumps;
1024 
1025   /// All chains of nodes.
1026   std::vector<ChainT> AllChains;
1027 
1028   /// All edges between the chains.
1029   std::vector<ChainEdge> AllEdges;
1030 
1031   /// Active chains. The vector gets updated at runtime when chains are merged.
1032   std::vector<ChainT *> HotChains;
1033 };
1034 
1035 /// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
1036 /// functions represented by a call graph.
1037 class CDSortImpl {
1038 public:
1039   CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
1040              ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
1041              ArrayRef<uint64_t> EdgeOffsets)
1042       : Config(Config), NumNodes(NodeSizes.size()) {
1043     initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1044   }
1045 
1046   /// Run the algorithm and return an ordered set of function clusters.
1047   std::vector<uint64_t> run() {
1048     // Merge pairs of chains while improving the objective.
1049     mergeChainPairs();
1050 
1051     LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1052                       << " of chains from " << NumNodes << " to "
1053                       << HotChains.size() << "\n");
1054 
1055     // Collect nodes from all the chains.
1056     return concatChains();
1057   }
1058 
1059 private:
1060   /// Initialize the algorithm's data structures.
1061   void initialize(const ArrayRef<uint64_t> &NodeSizes,
1062                   const ArrayRef<uint64_t> &NodeCounts,
1063                   const ArrayRef<EdgeCount> &EdgeCounts,
1064                   const ArrayRef<uint64_t> &EdgeOffsets) {
1065     // Initialize nodes.
1066     AllNodes.reserve(NumNodes);
1067     for (uint64_t Node = 0; Node < NumNodes; Node++) {
1068       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1069       uint64_t ExecutionCount = NodeCounts[Node];
1070       AllNodes.emplace_back(Node, Size, ExecutionCount);
1071       TotalSamples += ExecutionCount;
1072       if (ExecutionCount > 0)
1073         TotalSize += Size;
1074     }
1075 
1076     // Initialize jumps between the nodes.
1077     SuccNodes.resize(NumNodes);
1078     PredNodes.resize(NumNodes);
1079     AllJumps.reserve(EdgeCounts.size());
1080     for (size_t I = 0; I < EdgeCounts.size(); I++) {
1081       auto [Pred, Succ, Count] = EdgeCounts[I];
1082       // Ignore recursive calls.
1083       if (Pred == Succ)
1084         continue;
1085 
1086       SuccNodes[Pred].push_back(Succ);
1087       PredNodes[Succ].push_back(Pred);
1088       if (Count > 0) {
1089         NodeT &PredNode = AllNodes[Pred];
1090         NodeT &SuccNode = AllNodes[Succ];
1091         AllJumps.emplace_back(&PredNode, &SuccNode, Count);
1092         AllJumps.back().Offset = EdgeOffsets[I];
1093         SuccNode.InJumps.push_back(&AllJumps.back());
1094         PredNode.OutJumps.push_back(&AllJumps.back());
1095       }
1096     }
1097 
1098     // Initialize chains.
1099     AllChains.reserve(NumNodes);
1100     HotChains.reserve(NumNodes);
1101     for (NodeT &Node : AllNodes) {
1102       // Adjust execution counts.
1103       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1104       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1105       // Create chain.
1106       AllChains.emplace_back(Node.Index, &Node);
1107       Node.CurChain = &AllChains.back();
1108       if (Node.ExecutionCount > 0)
1109         HotChains.push_back(&AllChains.back());
1110     }
1111 
1112     // Initialize chain edges.
1113     AllEdges.reserve(AllJumps.size());
1114     for (NodeT &PredNode : AllNodes) {
1115       for (JumpT *Jump : PredNode.OutJumps) {
1116         NodeT *SuccNode = Jump->Target;
1117         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1118         // this edge is already present in the graph.
1119         if (CurEdge != nullptr) {
1120           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1121           CurEdge->appendJump(Jump);
1122           continue;
1123         }
1124         // this is a new edge.
1125         AllEdges.emplace_back(Jump);
1126         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1127         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1128       }
1129     }
1130   }
1131 
1132   /// Merge pairs of chains while there is an improvement in the objective.
1133   void mergeChainPairs() {
1134     // Create a priority queue containing all edges ordered by the merge gain.
1135     auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1136       return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1137              std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1138     };
1139     std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1140 
1141     // Insert the edges into the queue.
1142     for (ChainT *ChainPred : HotChains) {
1143       for (const auto &[_, Edge] : ChainPred->Edges) {
1144         // Ignore self-edges.
1145         if (Edge->isSelfEdge())
1146           continue;
1147         // Ignore already processed edges.
1148         if (Edge->gain() != -1.0)
1149           continue;
1150 
1151         // Compute the gain of merging the two chains.
1152         MergeGainT Gain = getBestMergeGain(Edge);
1153         Edge->setMergeGain(Gain);
1154 
1155         if (Edge->gain() > EPS)
1156           Queue.insert(Edge);
1157       }
1158     }
1159 
1160     // Merge the chains while the gain of merging is positive.
1161     while (!Queue.empty()) {
1162       // Extract the best (top) edge for merging.
1163       ChainEdge *BestEdge = *Queue.begin();
1164       Queue.erase(Queue.begin());
1165       // Ignore self-edges.
1166       if (BestEdge->isSelfEdge())
1167         continue;
1168       // Ignore edges with non-positive gains.
1169       if (BestEdge->gain() <= EPS)
1170         continue;
1171 
1172       ChainT *BestSrcChain = BestEdge->srcChain();
1173       ChainT *BestDstChain = BestEdge->dstChain();
1174 
1175       // Remove outdated edges from the queue.
1176       for (const auto &[_, ChainEdge] : BestSrcChain->Edges)
1177         Queue.erase(ChainEdge);
1178       for (const auto &[_, ChainEdge] : BestDstChain->Edges)
1179         Queue.erase(ChainEdge);
1180 
1181       // Merge the best pair of chains.
1182       MergeGainT BestGain = BestEdge->getMergeGain();
1183       mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1184                   BestGain.mergeType());
1185 
1186       // Insert newly created edges into the queue.
1187       for (const auto &[_, Edge] : BestSrcChain->Edges) {
1188         // Ignore loop edges.
1189         if (Edge->isSelfEdge())
1190           continue;
1191 
1192         // Compute the gain of merging the two chains.
1193         MergeGainT Gain = getBestMergeGain(Edge);
1194         Edge->setMergeGain(Gain);
1195 
1196         if (Edge->gain() > EPS)
1197           Queue.insert(Edge);
1198       }
1199     }
1200   }
1201 
1202   /// Compute the gain of merging two chains.
1203   ///
1204   /// The function considers all possible ways of merging two chains and
1205   /// computes the one having the largest increase in ExtTSP objective. The
1206   /// result is a pair with the first element being the gain and the second
1207   /// element being the corresponding merging type.
1208   MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1209     // Precompute jumps between ChainPred and ChainSucc.
1210     auto Jumps = Edge->jumps();
1211     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
1212     ChainT *SrcChain = Edge->srcChain();
1213     ChainT *DstChain = Edge->dstChain();
1214 
1215     // This object holds the best currently chosen gain of merging two chains.
1216     MergeGainT Gain = MergeGainT();
1217 
1218     /// Given a list of merge types, try to merge two chains and update Gain
1219     /// with a better alternative.
1220     auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1221       // Apply the merge, compute the corresponding gain, and update the best
1222       // value, if the merge is beneficial.
1223       for (const MergeTypeT &MergeType : MergeTypes) {
1224         MergeGainT NewGain =
1225             computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1226 
1227         // When forward and backward gains are the same, prioritize merging that
1228         // preserves the original order of the functions in the binary.
1229         if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1230           if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1231               (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1232             Gain = NewGain;
1233           }
1234         } else if (NewGain.score() > Gain.score() + EPS) {
1235           Gain = NewGain;
1236         }
1237       }
1238     };
1239 
1240     // Try to concatenate two chains w/o splitting.
1241     tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1242 
1243     return Gain;
1244   }
1245 
1246   /// Compute the score gain of merging two chains, respecting a given type.
1247   ///
1248   /// The two chains are not modified in the method.
1249   MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1250                               const std::vector<JumpT *> &Jumps,
1251                               MergeTypeT MergeType) const {
1252     // This doesn't depend on the ordering of the nodes
1253     double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1254 
1255     // Merge offset is always 0, as the chains are not split.
1256     size_t MergeOffset = 0;
1257     auto MergedBlocks =
1258         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1259     double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1260 
1261     double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1262     // Scale the result to increase the importance of merging short chains.
1263     if (GainScore >= 0.0)
1264       GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1265 
1266     return MergeGainT(GainScore, MergeOffset, MergeType);
1267   }
1268 
1269   /// Compute the change of the frequency locality after merging the chains.
1270   double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1271     auto missProbability = [&](double ChainDensity) {
1272       double PageSamples = ChainDensity * Config.CacheSize;
1273       if (PageSamples >= TotalSamples)
1274         return 0.0;
1275       double P = PageSamples / TotalSamples;
1276       return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1277     };
1278 
1279     // Cache misses on the chains before merging.
1280     double CurScore =
1281         ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1282         ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1283 
1284     // Cache misses on the merged chain
1285     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1286     double MergedSize = ChainPred->Size + ChainSucc->Size;
1287     double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1288     double NewScore = MergedCounts * missProbability(MergedDensity);
1289 
1290     return CurScore - NewScore;
1291   }
1292 
1293   /// Compute the distance locality for a jump / call.
1294   double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1295     uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1296     double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1297     return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1298   }
1299 
1300   /// Compute the change of the distance locality after merging the chains.
1301   double distBasedLocalityGain(const MergedVector<NodeT> &MergedBlocks,
1302                                const std::vector<JumpT *> &Jumps) const {
1303     if (Jumps.empty())
1304       return 0.0;
1305     uint64_t CurAddr = 0;
1306     MergedBlocks.forEach([&](const NodeT *Node) {
1307       Node->EstimatedAddr = CurAddr;
1308       CurAddr += Node->Size;
1309     });
1310 
1311     double CurScore = 0;
1312     double NewScore = 0;
1313     for (const JumpT *Arc : Jumps) {
1314       uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset;
1315       uint64_t DstAddr = Arc->Target->EstimatedAddr;
1316       NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount);
1317       CurScore += distScore(0, TotalSize, Arc->ExecutionCount);
1318     }
1319     return NewScore - CurScore;
1320   }
1321 
1322   /// Merge chain From into chain Into, update the list of active chains,
1323   /// adjacency information, and the corresponding cached values.
1324   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1325                    MergeTypeT MergeType) {
1326     assert(Into != From && "a chain cannot be merged with itself");
1327 
1328     // Merge the nodes.
1329     MergedVector<NodeT> MergedNodes =
1330         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1331     Into->merge(From, MergedNodes.getNodes());
1332 
1333     // Merge the edges.
1334     Into->mergeEdges(From);
1335     From->clear();
1336 
1337     // Remove the chain from the list of active chains.
1338     llvm::erase_value(HotChains, From);
1339   }
1340 
1341   /// Concatenate all chains into the final order.
1342   std::vector<uint64_t> concatChains() {
1343     // Collect chains and calculate density stats for their sorting.
1344     std::vector<const ChainT *> SortedChains;
1345     DenseMap<const ChainT *, double> ChainDensity;
1346     for (ChainT &Chain : AllChains) {
1347       if (!Chain.Nodes.empty()) {
1348         SortedChains.push_back(&Chain);
1349         // Using doubles to avoid overflow of ExecutionCounts.
1350         double Size = 0;
1351         double ExecutionCount = 0;
1352         for (NodeT *Node : Chain.Nodes) {
1353           Size += static_cast<double>(Node->Size);
1354           ExecutionCount += static_cast<double>(Node->ExecutionCount);
1355         }
1356         assert(Size > 0 && "a chain of zero size");
1357         ChainDensity[&Chain] = ExecutionCount / Size;
1358       }
1359     }
1360 
1361     // Sort chains by density in the decreasing order.
1362     std::sort(SortedChains.begin(), SortedChains.end(),
1363               [&](const ChainT *L, const ChainT *R) {
1364                 const double DL = ChainDensity[L];
1365                 const double DR = ChainDensity[R];
1366                 // Compare by density and break ties by chain identifiers.
1367                 return std::make_tuple(-DL, L->Id) <
1368                        std::make_tuple(-DR, R->Id);
1369               });
1370 
1371     // Collect the nodes in the order specified by their chains.
1372     std::vector<uint64_t> Order;
1373     Order.reserve(NumNodes);
1374     for (const ChainT *Chain : SortedChains)
1375       for (NodeT *Node : Chain->Nodes)
1376         Order.push_back(Node->Index);
1377     return Order;
1378   }
1379 
1380 private:
1381   /// Config for the algorithm.
1382   const CDSortConfig Config;
1383 
1384   /// The number of nodes in the graph.
1385   const size_t NumNodes;
1386 
1387   /// Successors of each node.
1388   std::vector<std::vector<uint64_t>> SuccNodes;
1389 
1390   /// Predecessors of each node.
1391   std::vector<std::vector<uint64_t>> PredNodes;
1392 
1393   /// All nodes (functions) in the graph.
1394   std::vector<NodeT> AllNodes;
1395 
1396   /// All jumps (function calls) between the nodes.
1397   std::vector<JumpT> AllJumps;
1398 
1399   /// All chains of nodes.
1400   std::vector<ChainT> AllChains;
1401 
1402   /// All edges between the chains.
1403   std::vector<ChainEdge> AllEdges;
1404 
1405   /// Active chains. The vector gets updated at runtime when chains are merged.
1406   std::vector<ChainT *> HotChains;
1407 
1408   /// The total number of samples in the graph.
1409   uint64_t TotalSamples{0};
1410 
1411   /// The total size of the nodes in the graph.
1412   uint64_t TotalSize{0};
1413 };
1414 
1415 } // end of anonymous namespace
1416 
1417 std::vector<uint64_t>
1418 codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
1419                                 ArrayRef<uint64_t> NodeCounts,
1420                                 ArrayRef<EdgeCount> EdgeCounts) {
1421   // Verify correctness of the input data.
1422   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1423   assert(NodeSizes.size() > 2 && "Incorrect input");
1424 
1425   // Apply the reordering algorithm.
1426   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1427   std::vector<uint64_t> Result = Alg.run();
1428 
1429   // Verify correctness of the output.
1430   assert(Result.front() == 0 && "Original entry point is not preserved");
1431   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
1432   return Result;
1433 }
1434 
1435 double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
1436                                    ArrayRef<uint64_t> NodeSizes,
1437                                    ArrayRef<uint64_t> NodeCounts,
1438                                    ArrayRef<EdgeCount> EdgeCounts) {
1439   // Estimate addresses of the blocks in memory.
1440   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
1441   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
1442     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
1443   }
1444   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1445   for (auto Edge : EdgeCounts)
1446     ++OutDegree[Edge.src];
1447 
1448   // Increase the score for each jump.
1449   double Score = 0;
1450   for (auto Edge : EdgeCounts) {
1451     bool IsConditional = OutDegree[Edge.src] > 1;
1452     Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
1453                            Edge.count, IsConditional);
1454   }
1455   return Score;
1456 }
1457 
1458 double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
1459                                    ArrayRef<uint64_t> NodeCounts,
1460                                    ArrayRef<EdgeCount> EdgeCounts) {
1461   std::vector<uint64_t> Order(NodeSizes.size());
1462   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1463     Order[Idx] = Idx;
1464   }
1465   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1466 }
1467 
1468 std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1469     const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
1470     ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
1471     ArrayRef<uint64_t> CallOffsets) {
1472   // Verify correctness of the input data.
1473   assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1474 
1475   // Apply the reordering algorithm.
1476   CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1477   std::vector<uint64_t> Result = Alg.run();
1478   assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1479   return Result;
1480 }
1481 
1482 std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1483     ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
1484     ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
1485   CDSortConfig Config;
1486   // Populate the config from the command-line options.
1487   if (CacheEntries.getNumOccurrences() > 0)
1488     Config.CacheEntries = CacheEntries;
1489   if (CacheSize.getNumOccurrences() > 0)
1490     Config.CacheSize = CacheSize;
1491   if (DistancePower.getNumOccurrences() > 0)
1492     Config.DistancePower = DistancePower;
1493   if (FrequencyScale.getNumOccurrences() > 0)
1494     Config.FrequencyScale = FrequencyScale;
1495   return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
1496                                     CallOffsets);
1497 }
1498