xref: /llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision f61179f81277eec271d0c561de99a6c4bb7c370d)
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The file implements "cache-aware" layout algorithms of basic blocks and
10 // functions in a binary.
11 //
12 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
13 // optimizing jump locality and thus processor I-cache utilization. This is
14 // achieved via increasing the number of fall-through jumps and co-locating
15 // frequently executed nodes together. The name follows the underlying
16 // optimization problem, Extended-TSP, which is a generalization of classical
17 // (maximum) Traveling Salesmen Problem.
18 //
19 // The algorithm is a greedy heuristic that works with chains (ordered lists)
20 // of basic blocks. Initially all chains are isolated basic blocks. On every
21 // iteration, we pick a pair of chains whose merging yields the biggest increase
22 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
23 // A pair of chains giving the maximum gain is merged into a new chain. The
24 // procedure stops when there is only one chain left, or when merging does not
25 // increase ExtTSP. In the latter case, the remaining chains are sorted by
26 // density in the decreasing order.
27 //
28 // An important aspect is the way two chains are merged. Unlike earlier
29 // algorithms (e.g., based on the approach of Pettis-Hansen), two
30 // chains, X and Y, are first split into three, X1, X2, and Y. Then we
31 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
32 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
33 // This improves the quality of the final result (the search space is larger)
34 // while keeping the implementation sufficiently fast.
35 //
36 // Reference:
37 //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
38 //     IEEE Transactions on Computers, 2020
39 //     https://arxiv.org/abs/1809.04676
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #include "llvm/Transforms/Utils/CodeLayout.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 
47 #include <cmath>
48 #include <set>
49 
50 using namespace llvm;
51 using namespace llvm::codelayout;
52 
53 #define DEBUG_TYPE "code-layout"
54 
55 namespace llvm {
56 cl::opt<bool> EnableExtTspBlockPlacement(
57     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
58     cl::desc("Enable machine block placement based on the ext-tsp model, "
59              "optimizing I-cache utilization."));
60 
61 cl::opt<bool> ApplyExtTspWithoutProfile(
62     "ext-tsp-apply-without-profile",
63     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
64     cl::init(true), cl::Hidden);
65 } // namespace llvm
66 
67 // Algorithm-specific params for Ext-TSP. The values are tuned for the best
68 // performance of large-scale front-end bound binaries.
69 static cl::opt<double> ForwardWeightCond(
70     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
71     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
72 
73 static cl::opt<double> ForwardWeightUncond(
74     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
75     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
76 
77 static cl::opt<double> BackwardWeightCond(
78     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
79     cl::desc("The weight of conditional backward jumps for ExtTSP value"));
80 
81 static cl::opt<double> BackwardWeightUncond(
82     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
83     cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
84 
85 static cl::opt<double> FallthroughWeightCond(
86     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
87     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
88 
89 static cl::opt<double> FallthroughWeightUncond(
90     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
91     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
92 
93 static cl::opt<unsigned> ForwardDistance(
94     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
95     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
96 
97 static cl::opt<unsigned> BackwardDistance(
98     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
99     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
100 
101 // The maximum size of a chain created by the algorithm. The size is bounded
102 // so that the algorithm can efficiently process extremely large instances.
103 static cl::opt<unsigned>
104     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512),
105                  cl::desc("The maximum size of a chain to create"));
106 
107 // The maximum size of a chain for splitting. Larger values of the threshold
108 // may yield better quality at the cost of worsen run-time.
109 static cl::opt<unsigned> ChainSplitThreshold(
110     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
111     cl::desc("The maximum size of a chain to apply splitting"));
112 
113 // The maximum ratio between densities of two chains for merging.
114 static cl::opt<double> MaxMergeDensityRatio(
115     "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(100),
116     cl::desc("The maximum ratio between densities of two chains for merging"));
117 
118 // Algorithm-specific options for CDSort.
119 static cl::opt<unsigned> CacheEntries("cdsort-cache-entries", cl::ReallyHidden,
120                                       cl::desc("The size of the cache"));
121 
122 static cl::opt<unsigned> CacheSize("cdsort-cache-size", cl::ReallyHidden,
123                                    cl::desc("The size of a line in the cache"));
124 
125 static cl::opt<unsigned>
126     CDMaxChainSize("cdsort-max-chain-size", cl::ReallyHidden,
127                    cl::desc("The maximum size of a chain to create"));
128 
129 static cl::opt<double> DistancePower(
130     "cdsort-distance-power", cl::ReallyHidden,
131     cl::desc("The power exponent for the distance-based locality"));
132 
133 static cl::opt<double> FrequencyScale(
134     "cdsort-frequency-scale", cl::ReallyHidden,
135     cl::desc("The scale factor for the frequency-based locality"));
136 
137 namespace {
138 
139 // Epsilon for comparison of doubles.
140 constexpr double EPS = 1e-8;
141 
142 // Compute the Ext-TSP score for a given jump.
143 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
144                        double Weight) {
145   if (JumpDist > JumpMaxDist)
146     return 0;
147   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
148   return Weight * Prob * Count;
149 }
150 
151 // Compute the Ext-TSP score for a jump between a given pair of blocks,
152 // using their sizes, (estimated) addresses and the jump execution count.
153 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
154                    uint64_t Count, bool IsConditional) {
155   // Fallthrough
156   if (SrcAddr + SrcSize == DstAddr) {
157     return jumpExtTSPScore(0, 1, Count,
158                            IsConditional ? FallthroughWeightCond
159                                          : FallthroughWeightUncond);
160   }
161   // Forward
162   if (SrcAddr + SrcSize < DstAddr) {
163     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
164     return jumpExtTSPScore(Dist, ForwardDistance, Count,
165                            IsConditional ? ForwardWeightCond
166                                          : ForwardWeightUncond);
167   }
168   // Backward
169   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
170   return jumpExtTSPScore(Dist, BackwardDistance, Count,
171                          IsConditional ? BackwardWeightCond
172                                        : BackwardWeightUncond);
173 }
174 
175 /// A type of merging two chains, X and Y. The former chain is split into
176 /// X1 and X2 and then concatenated with Y in the order specified by the type.
177 enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
178 
179 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
180 /// together with the corresponding merge 'type' and 'offset'.
181 struct MergeGainT {
182   explicit MergeGainT() = default;
183   explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
184       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
185 
186   double score() const { return Score; }
187 
188   size_t mergeOffset() const { return MergeOffset; }
189 
190   MergeTypeT mergeType() const { return MergeType; }
191 
192   void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
193 
194   // Returns 'true' iff Other is preferred over this.
195   bool operator<(const MergeGainT &Other) const {
196     return (Other.Score > EPS && Other.Score > Score + EPS);
197   }
198 
199   // Update the current gain if Other is preferred over this.
200   void updateIfLessThan(const MergeGainT &Other) {
201     if (*this < Other)
202       *this = Other;
203   }
204 
205 private:
206   double Score{-1.0};
207   size_t MergeOffset{0};
208   MergeTypeT MergeType{MergeTypeT::X_Y};
209 };
210 
211 struct JumpT;
212 struct ChainT;
213 struct ChainEdge;
214 
215 /// A node in the graph, typically corresponding to a basic block in the CFG or
216 /// a function in the call graph.
217 struct NodeT {
218   NodeT(const NodeT &) = delete;
219   NodeT(NodeT &&) = default;
220   NodeT &operator=(const NodeT &) = delete;
221   NodeT &operator=(NodeT &&) = default;
222 
223   explicit NodeT(size_t Index, uint64_t Size, uint64_t Count)
224       : Index(Index), Size(Size), ExecutionCount(Count) {}
225 
226   bool isEntry() const { return Index == 0; }
227 
228   // Check if Other is a successor of the node.
229   bool isSuccessor(const NodeT *Other) const;
230 
231   // The total execution count of outgoing jumps.
232   uint64_t outCount() const;
233 
234   // The total execution count of incoming jumps.
235   uint64_t inCount() const;
236 
237   // The original index of the node in graph.
238   size_t Index{0};
239   // The index of the node in the current chain.
240   size_t CurIndex{0};
241   // The size of the node in the binary.
242   uint64_t Size{0};
243   // The execution count of the node in the profile data.
244   uint64_t ExecutionCount{0};
245   // The current chain of the node.
246   ChainT *CurChain{nullptr};
247   // The offset of the node in the current chain.
248   mutable uint64_t EstimatedAddr{0};
249   // Forced successor of the node in the graph.
250   NodeT *ForcedSucc{nullptr};
251   // Forced predecessor of the node in the graph.
252   NodeT *ForcedPred{nullptr};
253   // Outgoing jumps from the node.
254   std::vector<JumpT *> OutJumps;
255   // Incoming jumps to the node.
256   std::vector<JumpT *> InJumps;
257 };
258 
259 /// An arc in the graph, typically corresponding to a jump between two nodes.
260 struct JumpT {
261   JumpT(const JumpT &) = delete;
262   JumpT(JumpT &&) = default;
263   JumpT &operator=(const JumpT &) = delete;
264   JumpT &operator=(JumpT &&) = default;
265 
266   explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
267       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
268 
269   // Source node of the jump.
270   NodeT *Source;
271   // Target node of the jump.
272   NodeT *Target;
273   // Execution count of the arc in the profile data.
274   uint64_t ExecutionCount{0};
275   // Whether the jump corresponds to a conditional branch.
276   bool IsConditional{false};
277   // The offset of the jump from the source node.
278   uint64_t Offset{0};
279 };
280 
281 /// A chain (ordered sequence) of nodes in the graph.
282 struct ChainT {
283   ChainT(const ChainT &) = delete;
284   ChainT(ChainT &&) = default;
285   ChainT &operator=(const ChainT &) = delete;
286   ChainT &operator=(ChainT &&) = default;
287 
288   explicit ChainT(uint64_t Id, NodeT *Node)
289       : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
290         Nodes(1, Node) {}
291 
292   size_t numBlocks() const { return Nodes.size(); }
293 
294   double density() const { return ExecutionCount / Size; }
295 
296   bool isEntry() const { return Nodes[0]->Index == 0; }
297 
298   bool isCold() const {
299     for (NodeT *Node : Nodes) {
300       if (Node->ExecutionCount > 0)
301         return false;
302     }
303     return true;
304   }
305 
306   ChainEdge *getEdge(ChainT *Other) const {
307     for (const auto &[Chain, ChainEdge] : Edges) {
308       if (Chain == Other)
309         return ChainEdge;
310     }
311     return nullptr;
312   }
313 
314   void removeEdge(ChainT *Other) {
315     auto It = Edges.begin();
316     while (It != Edges.end()) {
317       if (It->first == Other) {
318         Edges.erase(It);
319         return;
320       }
321       It++;
322     }
323   }
324 
325   void addEdge(ChainT *Other, ChainEdge *Edge) {
326     Edges.push_back(std::make_pair(Other, Edge));
327   }
328 
329   void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) {
330     Nodes = std::move(MergedBlocks);
331     // Update the chain's data.
332     ExecutionCount += Other->ExecutionCount;
333     Size += Other->Size;
334     Id = Nodes[0]->Index;
335     // Update the node's data.
336     for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
337       Nodes[Idx]->CurChain = this;
338       Nodes[Idx]->CurIndex = Idx;
339     }
340   }
341 
342   void mergeEdges(ChainT *Other);
343 
344   void clear() {
345     Nodes.clear();
346     Nodes.shrink_to_fit();
347     Edges.clear();
348     Edges.shrink_to_fit();
349   }
350 
351   // Unique chain identifier.
352   uint64_t Id;
353   // Cached ext-tsp score for the chain.
354   double Score{0};
355   // The total execution count of the chain. Since the execution count of
356   // a basic block is uint64_t, using doubles here to avoid overflow.
357   double ExecutionCount{0};
358   // The total size of the chain.
359   uint64_t Size{0};
360   // Nodes of the chain.
361   std::vector<NodeT *> Nodes;
362   // Adjacent chains and corresponding edges (lists of jumps).
363   std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
364 };
365 
366 /// An edge in the graph representing jumps between two chains.
367 /// When nodes are merged into chains, the edges are combined too so that
368 /// there is always at most one edge between a pair of chains.
369 struct ChainEdge {
370   ChainEdge(const ChainEdge &) = delete;
371   ChainEdge(ChainEdge &&) = default;
372   ChainEdge &operator=(const ChainEdge &) = delete;
373   ChainEdge &operator=(ChainEdge &&) = delete;
374 
375   explicit ChainEdge(JumpT *Jump)
376       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
377         Jumps(1, Jump) {}
378 
379   ChainT *srcChain() const { return SrcChain; }
380 
381   ChainT *dstChain() const { return DstChain; }
382 
383   bool isSelfEdge() const { return SrcChain == DstChain; }
384 
385   const std::vector<JumpT *> &jumps() const { return Jumps; }
386 
387   void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
388 
389   void moveJumps(ChainEdge *Other) {
390     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
391     Other->Jumps.clear();
392     Other->Jumps.shrink_to_fit();
393   }
394 
395   void changeEndpoint(ChainT *From, ChainT *To) {
396     if (From == SrcChain)
397       SrcChain = To;
398     if (From == DstChain)
399       DstChain = To;
400   }
401 
402   bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
403     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
404   }
405 
406   MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
407     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
408   }
409 
410   void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
411     if (Src == SrcChain) {
412       CachedGainForward = MergeGain;
413       CacheValidForward = true;
414     } else {
415       CachedGainBackward = MergeGain;
416       CacheValidBackward = true;
417     }
418   }
419 
420   void invalidateCache() {
421     CacheValidForward = false;
422     CacheValidBackward = false;
423   }
424 
425   void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
426 
427   MergeGainT getMergeGain() const { return CachedGain; }
428 
429   double gain() const { return CachedGain.score(); }
430 
431 private:
432   // Source chain.
433   ChainT *SrcChain{nullptr};
434   // Destination chain.
435   ChainT *DstChain{nullptr};
436   // Original jumps in the binary with corresponding execution counts.
437   std::vector<JumpT *> Jumps;
438   // Cached gain value for merging the pair of chains.
439   MergeGainT CachedGain;
440 
441   // Cached gain values for merging the pair of chains. Since the gain of
442   // merging (Src, Dst) and (Dst, Src) might be different, we store both values
443   // here and a flag indicating which of the options results in a higher gain.
444   // Cached gain values.
445   MergeGainT CachedGainForward;
446   MergeGainT CachedGainBackward;
447   // Whether the cached value must be recomputed.
448   bool CacheValidForward{false};
449   bool CacheValidBackward{false};
450 };
451 
452 bool NodeT::isSuccessor(const NodeT *Other) const {
453   for (JumpT *Jump : OutJumps)
454     if (Jump->Target == Other)
455       return true;
456   return false;
457 }
458 
459 uint64_t NodeT::outCount() const {
460   uint64_t Count = 0;
461   for (JumpT *Jump : OutJumps)
462     Count += Jump->ExecutionCount;
463   return Count;
464 }
465 
466 uint64_t NodeT::inCount() const {
467   uint64_t Count = 0;
468   for (JumpT *Jump : InJumps)
469     Count += Jump->ExecutionCount;
470   return Count;
471 }
472 
473 void ChainT::mergeEdges(ChainT *Other) {
474   // Update edges adjacent to chain Other.
475   for (const auto &[DstChain, DstEdge] : Other->Edges) {
476     ChainT *TargetChain = DstChain == Other ? this : DstChain;
477     ChainEdge *CurEdge = getEdge(TargetChain);
478     if (CurEdge == nullptr) {
479       DstEdge->changeEndpoint(Other, this);
480       this->addEdge(TargetChain, DstEdge);
481       if (DstChain != this && DstChain != Other)
482         DstChain->addEdge(this, DstEdge);
483     } else {
484       CurEdge->moveJumps(DstEdge);
485     }
486     // Cleanup leftover edge.
487     if (DstChain != Other)
488       DstChain->removeEdge(Other);
489   }
490 }
491 
492 using NodeIter = std::vector<NodeT *>::const_iterator;
493 static std::vector<NodeT *> EmptyList;
494 
495 /// A wrapper around three concatenated vectors (chains) of nodes; it is used
496 /// to avoid extra instantiation of the vectors.
497 struct MergedNodesT {
498   MergedNodesT(NodeIter Begin1, NodeIter End1,
499                NodeIter Begin2 = EmptyList.begin(),
500                NodeIter End2 = EmptyList.end(),
501                NodeIter Begin3 = EmptyList.begin(),
502                NodeIter End3 = EmptyList.end())
503       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
504         End3(End3) {}
505 
506   template <typename F> void forEach(const F &Func) const {
507     for (auto It = Begin1; It != End1; It++)
508       Func(*It);
509     for (auto It = Begin2; It != End2; It++)
510       Func(*It);
511     for (auto It = Begin3; It != End3; It++)
512       Func(*It);
513   }
514 
515   std::vector<NodeT *> getNodes() const {
516     std::vector<NodeT *> Result;
517     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
518                    std::distance(Begin3, End3));
519     Result.insert(Result.end(), Begin1, End1);
520     Result.insert(Result.end(), Begin2, End2);
521     Result.insert(Result.end(), Begin3, End3);
522     return Result;
523   }
524 
525   const NodeT *getFirstNode() const { return *Begin1; }
526 
527 private:
528   NodeIter Begin1;
529   NodeIter End1;
530   NodeIter Begin2;
531   NodeIter End2;
532   NodeIter Begin3;
533   NodeIter End3;
534 };
535 
536 /// A wrapper around two concatenated vectors (chains) of jumps.
537 struct MergedJumpsT {
538   MergedJumpsT(const std::vector<JumpT *> *Jumps1,
539                const std::vector<JumpT *> *Jumps2 = nullptr) {
540     assert(!Jumps1->empty() && "cannot merge empty jump list");
541     JumpArray[0] = Jumps1;
542     JumpArray[1] = Jumps2;
543   }
544 
545   template <typename F> void forEach(const F &Func) const {
546     for (auto Jumps : JumpArray)
547       if (Jumps != nullptr)
548         for (JumpT *Jump : *Jumps)
549           Func(Jump);
550   }
551 
552 private:
553   std::array<const std::vector<JumpT *> *, 2> JumpArray{nullptr, nullptr};
554 };
555 
556 /// Merge two chains of nodes respecting a given 'type' and 'offset'.
557 ///
558 /// If MergeType == 0, then the result is a concatenation of two chains.
559 /// Otherwise, the first chain is cut into two sub-chains at the offset,
560 /// and merged using all possible ways of concatenating three chains.
561 MergedNodesT mergeNodes(const std::vector<NodeT *> &X,
562                         const std::vector<NodeT *> &Y, size_t MergeOffset,
563                         MergeTypeT MergeType) {
564   // Split the first chain, X, into X1 and X2.
565   NodeIter BeginX1 = X.begin();
566   NodeIter EndX1 = X.begin() + MergeOffset;
567   NodeIter BeginX2 = X.begin() + MergeOffset;
568   NodeIter EndX2 = X.end();
569   NodeIter BeginY = Y.begin();
570   NodeIter EndY = Y.end();
571 
572   // Construct a new chain from the three existing ones.
573   switch (MergeType) {
574   case MergeTypeT::X_Y:
575     return MergedNodesT(BeginX1, EndX2, BeginY, EndY);
576   case MergeTypeT::Y_X:
577     return MergedNodesT(BeginY, EndY, BeginX1, EndX2);
578   case MergeTypeT::X1_Y_X2:
579     return MergedNodesT(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
580   case MergeTypeT::Y_X2_X1:
581     return MergedNodesT(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
582   case MergeTypeT::X2_X1_Y:
583     return MergedNodesT(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
584   }
585   llvm_unreachable("unexpected chain merge type");
586 }
587 
588 /// The implementation of the ExtTSP algorithm.
589 class ExtTSPImpl {
590 public:
591   ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts,
592              ArrayRef<EdgeCount> EdgeCounts)
593       : NumNodes(NodeSizes.size()) {
594     initialize(NodeSizes, NodeCounts, EdgeCounts);
595   }
596 
597   /// Run the algorithm and return an optimized ordering of nodes.
598   std::vector<uint64_t> run() {
599     // Pass 1: Merge nodes with their mutually forced successors
600     mergeForcedPairs();
601 
602     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
603     mergeChainPairs();
604 
605     // Pass 3: Merge cold nodes to reduce code size
606     mergeColdChains();
607 
608     // Collect nodes from all chains
609     return concatChains();
610   }
611 
612 private:
613   /// Initialize the algorithm's data structures.
614   void initialize(const ArrayRef<uint64_t> &NodeSizes,
615                   const ArrayRef<uint64_t> &NodeCounts,
616                   const ArrayRef<EdgeCount> &EdgeCounts) {
617     // Initialize nodes
618     AllNodes.reserve(NumNodes);
619     for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
620       uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
621       uint64_t ExecutionCount = NodeCounts[Idx];
622       // The execution count of the entry node is set to at least one.
623       if (Idx == 0 && ExecutionCount == 0)
624         ExecutionCount = 1;
625       AllNodes.emplace_back(Idx, Size, ExecutionCount);
626     }
627 
628     // Initialize jumps between nodes
629     SuccNodes.resize(NumNodes);
630     PredNodes.resize(NumNodes);
631     std::vector<uint64_t> OutDegree(NumNodes, 0);
632     AllJumps.reserve(EdgeCounts.size());
633     for (auto Edge : EdgeCounts) {
634       ++OutDegree[Edge.src];
635       // Ignore self-edges.
636       if (Edge.src == Edge.dst)
637         continue;
638 
639       SuccNodes[Edge.src].push_back(Edge.dst);
640       PredNodes[Edge.dst].push_back(Edge.src);
641       if (Edge.count > 0) {
642         NodeT &PredNode = AllNodes[Edge.src];
643         NodeT &SuccNode = AllNodes[Edge.dst];
644         AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count);
645         SuccNode.InJumps.push_back(&AllJumps.back());
646         PredNode.OutJumps.push_back(&AllJumps.back());
647       }
648     }
649     for (JumpT &Jump : AllJumps) {
650       assert(OutDegree[Jump.Source->Index] > 0 &&
651              "incorrectly computed out-degree of the block");
652       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
653     }
654 
655     // Initialize chains.
656     AllChains.reserve(NumNodes);
657     HotChains.reserve(NumNodes);
658     for (NodeT &Node : AllNodes) {
659       // Create a chain.
660       AllChains.emplace_back(Node.Index, &Node);
661       Node.CurChain = &AllChains.back();
662       if (Node.ExecutionCount > 0)
663         HotChains.push_back(&AllChains.back());
664     }
665 
666     // Initialize chain edges.
667     AllEdges.reserve(AllJumps.size());
668     for (NodeT &PredNode : AllNodes) {
669       for (JumpT *Jump : PredNode.OutJumps) {
670         NodeT *SuccNode = Jump->Target;
671         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
672         // This edge is already present in the graph.
673         if (CurEdge != nullptr) {
674           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
675           CurEdge->appendJump(Jump);
676           continue;
677         }
678         // This is a new edge.
679         AllEdges.emplace_back(Jump);
680         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
681         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
682       }
683     }
684   }
685 
686   /// For a pair of nodes, A and B, node B is the forced successor of A,
687   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
688   /// to B are from A. Such nodes should be adjacent in the optimal ordering;
689   /// the method finds and merges such pairs of nodes.
690   void mergeForcedPairs() {
691     // Find forced pairs of blocks.
692     for (NodeT &Node : AllNodes) {
693       if (SuccNodes[Node.Index].size() == 1 &&
694           PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
695           SuccNodes[Node.Index][0] != 0) {
696         size_t SuccIndex = SuccNodes[Node.Index][0];
697         Node.ForcedSucc = &AllNodes[SuccIndex];
698         AllNodes[SuccIndex].ForcedPred = &Node;
699       }
700     }
701 
702     // There might be 'cycles' in the forced dependencies, since profile
703     // data isn't 100% accurate. Typically this is observed in loops, when the
704     // loop edges are the hottest successors for the basic blocks of the loop.
705     // Break the cycles by choosing the node with the smallest index as the
706     // head. This helps to keep the original order of the loops, which likely
707     // have already been rotated in the optimized manner.
708     for (NodeT &Node : AllNodes) {
709       if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
710         continue;
711 
712       NodeT *SuccNode = Node.ForcedSucc;
713       while (SuccNode != nullptr && SuccNode != &Node) {
714         SuccNode = SuccNode->ForcedSucc;
715       }
716       if (SuccNode == nullptr)
717         continue;
718       // Break the cycle.
719       AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
720       Node.ForcedPred = nullptr;
721     }
722 
723     // Merge nodes with their fallthrough successors.
724     for (NodeT &Node : AllNodes) {
725       if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
726         const NodeT *CurBlock = &Node;
727         while (CurBlock->ForcedSucc != nullptr) {
728           const NodeT *NextBlock = CurBlock->ForcedSucc;
729           mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
730           CurBlock = NextBlock;
731         }
732       }
733     }
734   }
735 
736   /// Merge pairs of chains while improving the ExtTSP objective.
737   void mergeChainPairs() {
738     /// Deterministically compare pairs of chains.
739     auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
740                                 const ChainT *A2, const ChainT *B2) {
741       return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
742     };
743 
744     while (HotChains.size() > 1) {
745       ChainT *BestChainPred = nullptr;
746       ChainT *BestChainSucc = nullptr;
747       MergeGainT BestGain;
748       // Iterate over all pairs of chains.
749       for (ChainT *ChainPred : HotChains) {
750         // Get candidates for merging with the current chain.
751         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
752           // Ignore loop edges.
753           if (Edge->isSelfEdge())
754             continue;
755           // Skip the merge if the combined chain violates the maximum specified
756           // size.
757           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
758             continue;
759           // Don't merge the chains if they have vastly different densities.
760           // Skip the merge if the ratio between the densities exceeds
761           // MaxMergeDensityRatio. Smaller values of the option result in fewer
762           // merges, and hence, more chains.
763           auto ChainPredDensity = ChainPred->density();
764           auto ChainSuccDensity = ChainSucc->density();
765           auto [minDensity, maxDensity] =
766               std::minmax(ChainPredDensity, ChainSuccDensity);
767           assert(minDensity > 0.0 && maxDensity > 0.0 &&
768                  "incorrectly computed chain densities");
769           const double Ratio = maxDensity / minDensity;
770           if (Ratio > MaxMergeDensityRatio)
771             continue;
772 
773           // Compute the gain of merging the two chains.
774           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
775           if (CurGain.score() <= EPS)
776             continue;
777 
778           if (BestGain < CurGain ||
779               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
780                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
781                                  BestChainSucc))) {
782             BestGain = CurGain;
783             BestChainPred = ChainPred;
784             BestChainSucc = ChainSucc;
785           }
786         }
787       }
788 
789       // Stop merging when there is no improvement.
790       if (BestGain.score() <= EPS)
791         break;
792 
793       // Merge the best pair of chains.
794       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
795                   BestGain.mergeType());
796     }
797   }
798 
799   /// Merge remaining nodes into chains w/o taking jump counts into
800   /// consideration. This allows to maintain the original node order in the
801   /// absence of profile data.
802   void mergeColdChains() {
803     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
804       // Iterating in reverse order to make sure original fallthrough jumps are
805       // merged first; this might be beneficial for code size.
806       size_t NumSuccs = SuccNodes[SrcBB].size();
807       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
808         size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
809         ChainT *SrcChain = AllNodes[SrcBB].CurChain;
810         ChainT *DstChain = AllNodes[DstBB].CurChain;
811         if (SrcChain != DstChain && !DstChain->isEntry() &&
812             SrcChain->Nodes.back()->Index == SrcBB &&
813             DstChain->Nodes.front()->Index == DstBB &&
814             SrcChain->isCold() == DstChain->isCold()) {
815           mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
816         }
817       }
818     }
819   }
820 
821   /// Compute the Ext-TSP score for a given node order and a list of jumps.
822   double extTSPScore(const MergedNodesT &Nodes,
823                      const MergedJumpsT &Jumps) const {
824     uint64_t CurAddr = 0;
825     Nodes.forEach([&](const NodeT *Node) {
826       Node->EstimatedAddr = CurAddr;
827       CurAddr += Node->Size;
828     });
829 
830     double Score = 0;
831     Jumps.forEach([&](const JumpT *Jump) {
832       const NodeT *SrcBlock = Jump->Source;
833       const NodeT *DstBlock = Jump->Target;
834       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
835                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
836                              Jump->IsConditional);
837     });
838     return Score;
839   }
840 
841   /// Compute the gain of merging two chains.
842   ///
843   /// The function considers all possible ways of merging two chains and
844   /// computes the one having the largest increase in ExtTSP objective. The
845   /// result is a pair with the first element being the gain and the second
846   /// element being the corresponding merging type.
847   MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
848                               ChainEdge *Edge) const {
849     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc))
850       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
851 
852     assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
853     // Precompute jumps between ChainPred and ChainSucc.
854     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
855     MergedJumpsT Jumps(&Edge->jumps(), EdgePP ? &EdgePP->jumps() : nullptr);
856 
857     // This object holds the best chosen gain of merging two chains.
858     MergeGainT Gain = MergeGainT();
859 
860     /// Given a merge offset and a list of merge types, try to merge two chains
861     /// and update Gain with a better alternative.
862     auto tryChainMerging = [&](size_t Offset,
863                                const std::vector<MergeTypeT> &MergeTypes) {
864       // Skip merging corresponding to concatenation w/o splitting.
865       if (Offset == 0 || Offset == ChainPred->Nodes.size())
866         return;
867       // Skip merging if it breaks Forced successors.
868       NodeT *Node = ChainPred->Nodes[Offset - 1];
869       if (Node->ForcedSucc != nullptr)
870         return;
871       // Apply the merge, compute the corresponding gain, and update the best
872       // value, if the merge is beneficial.
873       for (const MergeTypeT &MergeType : MergeTypes) {
874         Gain.updateIfLessThan(
875             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
876       }
877     };
878 
879     // Try to concatenate two chains w/o splitting.
880     Gain.updateIfLessThan(
881         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
882 
883     // Attach (a part of) ChainPred before the first node of ChainSucc.
884     for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
885       const NodeT *SrcBlock = Jump->Source;
886       if (SrcBlock->CurChain != ChainPred)
887         continue;
888       size_t Offset = SrcBlock->CurIndex + 1;
889       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
890     }
891 
892     // Attach (a part of) ChainPred after the last node of ChainSucc.
893     for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
894       const NodeT *DstBlock = Jump->Target;
895       if (DstBlock->CurChain != ChainPred)
896         continue;
897       size_t Offset = DstBlock->CurIndex;
898       tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
899     }
900 
901     // Try to break ChainPred in various ways and concatenate with ChainSucc.
902     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
903       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
904         // Do not split the chain along a fall-through jump. One of the two
905         // loops above may still "break" such a jump whenever it results in a
906         // new fall-through.
907         const NodeT *BB = ChainPred->Nodes[Offset - 1];
908         const NodeT *BB2 = ChainPred->Nodes[Offset];
909         if (BB->isSuccessor(BB2))
910           continue;
911 
912         // In practice, applying X2_Y_X1 merging almost never provides benefits;
913         // thus, we exclude it from consideration to reduce the search space.
914         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
915                                  MergeTypeT::X2_X1_Y});
916       }
917     }
918 
919     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
920     return Gain;
921   }
922 
923   /// Compute the score gain of merging two chains, respecting a given
924   /// merge 'type' and 'offset'.
925   ///
926   /// The two chains are not modified in the method.
927   MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
928                               const MergedJumpsT &Jumps, size_t MergeOffset,
929                               MergeTypeT MergeType) const {
930     MergedNodesT MergedNodes =
931         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
932 
933     // Do not allow a merge that does not preserve the original entry point.
934     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
935         !MergedNodes.getFirstNode()->isEntry())
936       return MergeGainT();
937 
938     // The gain for the new chain.
939     double NewScore = extTSPScore(MergedNodes, Jumps);
940     double CurScore = ChainPred->Score;
941     return MergeGainT(NewScore - CurScore, MergeOffset, MergeType);
942   }
943 
944   /// Merge chain From into chain Into, update the list of active chains,
945   /// adjacency information, and the corresponding cached values.
946   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
947                    MergeTypeT MergeType) {
948     assert(Into != From && "a chain cannot be merged with itself");
949 
950     // Merge the nodes.
951     MergedNodesT MergedNodes =
952         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
953     Into->merge(From, MergedNodes.getNodes());
954 
955     // Merge the edges.
956     Into->mergeEdges(From);
957     From->clear();
958 
959     // Update cached ext-tsp score for the new chain.
960     ChainEdge *SelfEdge = Into->getEdge(Into);
961     if (SelfEdge != nullptr) {
962       MergedNodes = MergedNodesT(Into->Nodes.begin(), Into->Nodes.end());
963       MergedJumpsT MergedJumps(&SelfEdge->jumps());
964       Into->Score = extTSPScore(MergedNodes, MergedJumps);
965     }
966 
967     // Remove the chain from the list of active chains.
968     llvm::erase(HotChains, From);
969 
970     // Invalidate caches.
971     for (auto EdgeIt : Into->Edges)
972       EdgeIt.second->invalidateCache();
973   }
974 
975   /// Concatenate all chains into the final order.
976   std::vector<uint64_t> concatChains() {
977     // Collect non-empty chains.
978     std::vector<const ChainT *> SortedChains;
979     for (ChainT &Chain : AllChains) {
980       if (!Chain.Nodes.empty())
981         SortedChains.push_back(&Chain);
982     }
983 
984     // Sorting chains by density in the decreasing order.
985     std::sort(SortedChains.begin(), SortedChains.end(),
986               [&](const ChainT *L, const ChainT *R) {
987                 // Place the entry point at the beginning of the order.
988                 if (L->isEntry() != R->isEntry())
989                   return L->isEntry();
990 
991                 // Compare by density and break ties by chain identifiers.
992                 return std::make_tuple(-L->density(), L->Id) <
993                        std::make_tuple(-R->density(), R->Id);
994               });
995 
996     // Collect the nodes in the order specified by their chains.
997     std::vector<uint64_t> Order;
998     Order.reserve(NumNodes);
999     for (const ChainT *Chain : SortedChains)
1000       for (NodeT *Node : Chain->Nodes)
1001         Order.push_back(Node->Index);
1002     return Order;
1003   }
1004 
1005 private:
1006   /// The number of nodes in the graph.
1007   const size_t NumNodes;
1008 
1009   /// Successors of each node.
1010   std::vector<std::vector<uint64_t>> SuccNodes;
1011 
1012   /// Predecessors of each node.
1013   std::vector<std::vector<uint64_t>> PredNodes;
1014 
1015   /// All nodes (basic blocks) in the graph.
1016   std::vector<NodeT> AllNodes;
1017 
1018   /// All jumps between the nodes.
1019   std::vector<JumpT> AllJumps;
1020 
1021   /// All chains of nodes.
1022   std::vector<ChainT> AllChains;
1023 
1024   /// All edges between the chains.
1025   std::vector<ChainEdge> AllEdges;
1026 
1027   /// Active chains. The vector gets updated at runtime when chains are merged.
1028   std::vector<ChainT *> HotChains;
1029 };
1030 
1031 /// The implementation of the Cache-Directed Sort (CDSort) algorithm for
1032 /// ordering functions represented by a call graph.
1033 class CDSortImpl {
1034 public:
1035   CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
1036              ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
1037              ArrayRef<uint64_t> EdgeOffsets)
1038       : Config(Config), NumNodes(NodeSizes.size()) {
1039     initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1040   }
1041 
1042   /// Run the algorithm and return an ordered set of function clusters.
1043   std::vector<uint64_t> run() {
1044     // Merge pairs of chains while improving the objective.
1045     mergeChainPairs();
1046 
1047     // Collect nodes from all the chains.
1048     return concatChains();
1049   }
1050 
1051 private:
1052   /// Initialize the algorithm's data structures.
1053   void initialize(const ArrayRef<uint64_t> &NodeSizes,
1054                   const ArrayRef<uint64_t> &NodeCounts,
1055                   const ArrayRef<EdgeCount> &EdgeCounts,
1056                   const ArrayRef<uint64_t> &EdgeOffsets) {
1057     // Initialize nodes.
1058     AllNodes.reserve(NumNodes);
1059     for (uint64_t Node = 0; Node < NumNodes; Node++) {
1060       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1061       uint64_t ExecutionCount = NodeCounts[Node];
1062       AllNodes.emplace_back(Node, Size, ExecutionCount);
1063       TotalSamples += ExecutionCount;
1064       if (ExecutionCount > 0)
1065         TotalSize += Size;
1066     }
1067 
1068     // Initialize jumps between the nodes.
1069     SuccNodes.resize(NumNodes);
1070     PredNodes.resize(NumNodes);
1071     AllJumps.reserve(EdgeCounts.size());
1072     for (size_t I = 0; I < EdgeCounts.size(); I++) {
1073       auto [Pred, Succ, Count] = EdgeCounts[I];
1074       // Ignore recursive calls.
1075       if (Pred == Succ)
1076         continue;
1077 
1078       SuccNodes[Pred].push_back(Succ);
1079       PredNodes[Succ].push_back(Pred);
1080       if (Count > 0) {
1081         NodeT &PredNode = AllNodes[Pred];
1082         NodeT &SuccNode = AllNodes[Succ];
1083         AllJumps.emplace_back(&PredNode, &SuccNode, Count);
1084         AllJumps.back().Offset = EdgeOffsets[I];
1085         SuccNode.InJumps.push_back(&AllJumps.back());
1086         PredNode.OutJumps.push_back(&AllJumps.back());
1087       }
1088     }
1089 
1090     // Initialize chains.
1091     AllChains.reserve(NumNodes);
1092     for (NodeT &Node : AllNodes) {
1093       // Adjust execution counts.
1094       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1095       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1096       // Create chain.
1097       AllChains.emplace_back(Node.Index, &Node);
1098       Node.CurChain = &AllChains.back();
1099     }
1100 
1101     // Initialize chain edges.
1102     AllEdges.reserve(AllJumps.size());
1103     for (NodeT &PredNode : AllNodes) {
1104       for (JumpT *Jump : PredNode.OutJumps) {
1105         NodeT *SuccNode = Jump->Target;
1106         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1107         // this edge is already present in the graph.
1108         if (CurEdge != nullptr) {
1109           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1110           CurEdge->appendJump(Jump);
1111           continue;
1112         }
1113         // this is a new edge.
1114         AllEdges.emplace_back(Jump);
1115         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1116         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1117       }
1118     }
1119   }
1120 
1121   /// Merge pairs of chains while there is an improvement in the objective.
1122   void mergeChainPairs() {
1123     // Create a priority queue containing all edges ordered by the merge gain.
1124     auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1125       return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1126              std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1127     };
1128     std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1129 
1130     // Insert the edges into the queue.
1131     [[maybe_unused]] size_t NumActiveChains = 0;
1132     for (NodeT &Node : AllNodes) {
1133       if (Node.ExecutionCount == 0)
1134         continue;
1135       ++NumActiveChains;
1136       for (const auto &[_, Edge] : Node.CurChain->Edges) {
1137         // Ignore self-edges.
1138         if (Edge->isSelfEdge())
1139           continue;
1140         // Ignore already processed edges.
1141         if (Edge->gain() != -1.0)
1142           continue;
1143 
1144         // Compute the gain of merging the two chains.
1145         MergeGainT Gain = getBestMergeGain(Edge);
1146         Edge->setMergeGain(Gain);
1147 
1148         if (Edge->gain() > EPS)
1149           Queue.insert(Edge);
1150       }
1151     }
1152 
1153     // Merge the chains while the gain of merging is positive.
1154     while (!Queue.empty()) {
1155       // Extract the best (top) edge for merging.
1156       ChainEdge *BestEdge = *Queue.begin();
1157       Queue.erase(Queue.begin());
1158       ChainT *BestSrcChain = BestEdge->srcChain();
1159       ChainT *BestDstChain = BestEdge->dstChain();
1160 
1161       // Remove outdated edges from the queue.
1162       for (const auto &[_, ChainEdge] : BestSrcChain->Edges)
1163         Queue.erase(ChainEdge);
1164       for (const auto &[_, ChainEdge] : BestDstChain->Edges)
1165         Queue.erase(ChainEdge);
1166 
1167       // Merge the best pair of chains.
1168       MergeGainT BestGain = BestEdge->getMergeGain();
1169       mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1170                   BestGain.mergeType());
1171       --NumActiveChains;
1172 
1173       // Insert newly created edges into the queue.
1174       for (const auto &[_, Edge] : BestSrcChain->Edges) {
1175         // Ignore loop edges.
1176         if (Edge->isSelfEdge())
1177           continue;
1178         if (Edge->srcChain()->numBlocks() + Edge->dstChain()->numBlocks() >
1179             Config.MaxChainSize)
1180           continue;
1181 
1182         // Compute the gain of merging the two chains.
1183         MergeGainT Gain = getBestMergeGain(Edge);
1184         Edge->setMergeGain(Gain);
1185 
1186         if (Edge->gain() > EPS)
1187           Queue.insert(Edge);
1188       }
1189     }
1190 
1191     LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1192                       << " of chains from " << NumNodes << " to "
1193                       << NumActiveChains << "\n");
1194   }
1195 
1196   /// Compute the gain of merging two chains.
1197   ///
1198   /// The function considers all possible ways of merging two chains and
1199   /// computes the one having the largest increase in ExtTSP objective. The
1200   /// result is a pair with the first element being the gain and the second
1201   /// element being the corresponding merging type.
1202   MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1203     assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
1204     // Precompute jumps between ChainPred and ChainSucc.
1205     MergedJumpsT Jumps(&Edge->jumps());
1206     ChainT *SrcChain = Edge->srcChain();
1207     ChainT *DstChain = Edge->dstChain();
1208 
1209     // This object holds the best currently chosen gain of merging two chains.
1210     MergeGainT Gain = MergeGainT();
1211 
1212     /// Given a list of merge types, try to merge two chains and update Gain
1213     /// with a better alternative.
1214     auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1215       // Apply the merge, compute the corresponding gain, and update the best
1216       // value, if the merge is beneficial.
1217       for (const MergeTypeT &MergeType : MergeTypes) {
1218         MergeGainT NewGain =
1219             computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1220 
1221         // When forward and backward gains are the same, prioritize merging that
1222         // preserves the original order of the functions in the binary.
1223         if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1224           if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1225               (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1226             Gain = NewGain;
1227           }
1228         } else if (NewGain.score() > Gain.score() + EPS) {
1229           Gain = NewGain;
1230         }
1231       }
1232     };
1233 
1234     // Try to concatenate two chains w/o splitting.
1235     tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1236 
1237     return Gain;
1238   }
1239 
1240   /// Compute the score gain of merging two chains, respecting a given type.
1241   ///
1242   /// The two chains are not modified in the method.
1243   MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1244                               const MergedJumpsT &Jumps,
1245                               MergeTypeT MergeType) const {
1246     // This doesn't depend on the ordering of the nodes
1247     double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1248 
1249     // Merge offset is always 0, as the chains are not split.
1250     size_t MergeOffset = 0;
1251     auto MergedBlocks =
1252         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1253     double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1254 
1255     double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1256     // Scale the result to increase the importance of merging short chains.
1257     if (GainScore >= 0.0)
1258       GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1259 
1260     return MergeGainT(GainScore, MergeOffset, MergeType);
1261   }
1262 
1263   /// Compute the change of the frequency locality after merging the chains.
1264   double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1265     auto missProbability = [&](double ChainDensity) {
1266       double PageSamples = ChainDensity * Config.CacheSize;
1267       if (PageSamples >= TotalSamples)
1268         return 0.0;
1269       double P = PageSamples / TotalSamples;
1270       return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1271     };
1272 
1273     // Cache misses on the chains before merging.
1274     double CurScore =
1275         ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1276         ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1277 
1278     // Cache misses on the merged chain
1279     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1280     double MergedSize = ChainPred->Size + ChainSucc->Size;
1281     double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1282     double NewScore = MergedCounts * missProbability(MergedDensity);
1283 
1284     return CurScore - NewScore;
1285   }
1286 
1287   /// Compute the distance locality for a jump / call.
1288   double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1289     uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1290     double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1291     return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1292   }
1293 
1294   /// Compute the change of the distance locality after merging the chains.
1295   double distBasedLocalityGain(const MergedNodesT &Nodes,
1296                                const MergedJumpsT &Jumps) const {
1297     uint64_t CurAddr = 0;
1298     Nodes.forEach([&](const NodeT *Node) {
1299       Node->EstimatedAddr = CurAddr;
1300       CurAddr += Node->Size;
1301     });
1302 
1303     double CurScore = 0;
1304     double NewScore = 0;
1305     Jumps.forEach([&](const JumpT *Jump) {
1306       uint64_t SrcAddr = Jump->Source->EstimatedAddr + Jump->Offset;
1307       uint64_t DstAddr = Jump->Target->EstimatedAddr;
1308       NewScore += distScore(SrcAddr, DstAddr, Jump->ExecutionCount);
1309       CurScore += distScore(0, TotalSize, Jump->ExecutionCount);
1310     });
1311     return NewScore - CurScore;
1312   }
1313 
1314   /// Merge chain From into chain Into, update the list of active chains,
1315   /// adjacency information, and the corresponding cached values.
1316   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1317                    MergeTypeT MergeType) {
1318     assert(Into != From && "a chain cannot be merged with itself");
1319 
1320     // Merge the nodes.
1321     MergedNodesT MergedNodes =
1322         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1323     Into->merge(From, MergedNodes.getNodes());
1324 
1325     // Merge the edges.
1326     Into->mergeEdges(From);
1327     From->clear();
1328   }
1329 
1330   /// Concatenate all chains into the final order.
1331   std::vector<uint64_t> concatChains() {
1332     // Collect chains and calculate density stats for their sorting.
1333     std::vector<const ChainT *> SortedChains;
1334     DenseMap<const ChainT *, double> ChainDensity;
1335     for (ChainT &Chain : AllChains) {
1336       if (!Chain.Nodes.empty()) {
1337         SortedChains.push_back(&Chain);
1338         // Using doubles to avoid overflow of ExecutionCounts.
1339         double Size = 0;
1340         double ExecutionCount = 0;
1341         for (NodeT *Node : Chain.Nodes) {
1342           Size += static_cast<double>(Node->Size);
1343           ExecutionCount += static_cast<double>(Node->ExecutionCount);
1344         }
1345         assert(Size > 0 && "a chain of zero size");
1346         ChainDensity[&Chain] = ExecutionCount / Size;
1347       }
1348     }
1349 
1350     // Sort chains by density in the decreasing order.
1351     std::sort(SortedChains.begin(), SortedChains.end(),
1352               [&](const ChainT *L, const ChainT *R) {
1353                 const double DL = ChainDensity[L];
1354                 const double DR = ChainDensity[R];
1355                 // Compare by density and break ties by chain identifiers.
1356                 return std::make_tuple(-DL, L->Id) <
1357                        std::make_tuple(-DR, R->Id);
1358               });
1359 
1360     // Collect the nodes in the order specified by their chains.
1361     std::vector<uint64_t> Order;
1362     Order.reserve(NumNodes);
1363     for (const ChainT *Chain : SortedChains)
1364       for (NodeT *Node : Chain->Nodes)
1365         Order.push_back(Node->Index);
1366     return Order;
1367   }
1368 
1369 private:
1370   /// Config for the algorithm.
1371   const CDSortConfig Config;
1372 
1373   /// The number of nodes in the graph.
1374   const size_t NumNodes;
1375 
1376   /// Successors of each node.
1377   std::vector<std::vector<uint64_t>> SuccNodes;
1378 
1379   /// Predecessors of each node.
1380   std::vector<std::vector<uint64_t>> PredNodes;
1381 
1382   /// All nodes (functions) in the graph.
1383   std::vector<NodeT> AllNodes;
1384 
1385   /// All jumps (function calls) between the nodes.
1386   std::vector<JumpT> AllJumps;
1387 
1388   /// All chains of nodes.
1389   std::vector<ChainT> AllChains;
1390 
1391   /// All edges between the chains.
1392   std::vector<ChainEdge> AllEdges;
1393 
1394   /// The total number of samples in the graph.
1395   uint64_t TotalSamples{0};
1396 
1397   /// The total size of the nodes in the graph.
1398   uint64_t TotalSize{0};
1399 };
1400 
1401 } // end of anonymous namespace
1402 
1403 std::vector<uint64_t>
1404 codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
1405                                 ArrayRef<uint64_t> NodeCounts,
1406                                 ArrayRef<EdgeCount> EdgeCounts) {
1407   // Verify correctness of the input data.
1408   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1409   assert(NodeSizes.size() > 2 && "Incorrect input");
1410 
1411   // Apply the reordering algorithm.
1412   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1413   std::vector<uint64_t> Result = Alg.run();
1414 
1415   // Verify correctness of the output.
1416   assert(Result.front() == 0 && "Original entry point is not preserved");
1417   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
1418   return Result;
1419 }
1420 
1421 double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
1422                                    ArrayRef<uint64_t> NodeSizes,
1423                                    ArrayRef<uint64_t> NodeCounts,
1424                                    ArrayRef<EdgeCount> EdgeCounts) {
1425   // Estimate addresses of the blocks in memory.
1426   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
1427   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
1428     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
1429   }
1430   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1431   for (auto Edge : EdgeCounts)
1432     ++OutDegree[Edge.src];
1433 
1434   // Increase the score for each jump.
1435   double Score = 0;
1436   for (auto Edge : EdgeCounts) {
1437     bool IsConditional = OutDegree[Edge.src] > 1;
1438     Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
1439                            Edge.count, IsConditional);
1440   }
1441   return Score;
1442 }
1443 
1444 double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
1445                                    ArrayRef<uint64_t> NodeCounts,
1446                                    ArrayRef<EdgeCount> EdgeCounts) {
1447   std::vector<uint64_t> Order(NodeSizes.size());
1448   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1449     Order[Idx] = Idx;
1450   }
1451   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1452 }
1453 
1454 std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1455     const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
1456     ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
1457     ArrayRef<uint64_t> CallOffsets) {
1458   // Verify correctness of the input data.
1459   assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1460 
1461   // Apply the reordering algorithm.
1462   CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1463   std::vector<uint64_t> Result = Alg.run();
1464   assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1465   return Result;
1466 }
1467 
1468 std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1469     ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
1470     ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
1471   CDSortConfig Config;
1472   // Populate the config from the command-line options.
1473   if (CacheEntries.getNumOccurrences() > 0)
1474     Config.CacheEntries = CacheEntries;
1475   if (CacheSize.getNumOccurrences() > 0)
1476     Config.CacheSize = CacheSize;
1477   if (CDMaxChainSize.getNumOccurrences() > 0)
1478     Config.MaxChainSize = CDMaxChainSize;
1479   if (DistancePower.getNumOccurrences() > 0)
1480     Config.DistancePower = DistancePower;
1481   if (FrequencyScale.getNumOccurrences() > 0)
1482     Config.FrequencyScale = FrequencyScale;
1483   return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
1484                                     CallOffsets);
1485 }
1486