xref: /llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision a24418375a707b7314e7687888dd02eaefca42cd)
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The file implements "cache-aware" layout algorithms of basic blocks and
10 // functions in a binary.
11 //
12 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
13 // optimizing jump locality and thus processor I-cache utilization. This is
14 // achieved via increasing the number of fall-through jumps and co-locating
15 // frequently executed nodes together. The name follows the underlying
16 // optimization problem, Extended-TSP, which is a generalization of classical
17 // (maximum) Traveling Salesmen Problem.
18 //
19 // The algorithm is a greedy heuristic that works with chains (ordered lists)
20 // of basic blocks. Initially all chains are isolated basic blocks. On every
21 // iteration, we pick a pair of chains whose merging yields the biggest increase
22 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
23 // A pair of chains giving the maximum gain is merged into a new chain. The
24 // procedure stops when there is only one chain left, or when merging does not
25 // increase ExtTSP. In the latter case, the remaining chains are sorted by
26 // density in the decreasing order.
27 //
28 // An important aspect is the way two chains are merged. Unlike earlier
29 // algorithms (e.g., based on the approach of Pettis-Hansen), two
30 // chains, X and Y, are first split into three, X1, X2, and Y. Then we
31 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
32 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
33 // This improves the quality of the final result (the search space is larger)
34 // while keeping the implementation sufficiently fast.
35 //
36 // Reference:
37 //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
38 //     IEEE Transactions on Computers, 2020
39 //     https://arxiv.org/abs/1809.04676
40 //
41 //===----------------------------------------------------------------------===//
42 
43 #include "llvm/Transforms/Utils/CodeLayout.h"
44 #include "llvm/Support/CommandLine.h"
45 #include "llvm/Support/Debug.h"
46 
47 #include <cmath>
48 #include <set>
49 
50 using namespace llvm;
51 using namespace llvm::codelayout;
52 
53 #define DEBUG_TYPE "code-layout"
54 
55 namespace llvm {
56 cl::opt<bool> EnableExtTspBlockPlacement(
57     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
58     cl::desc("Enable machine block placement based on the ext-tsp model, "
59              "optimizing I-cache utilization."));
60 
61 cl::opt<bool> ApplyExtTspWithoutProfile(
62     "ext-tsp-apply-without-profile",
63     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
64     cl::init(true), cl::Hidden);
65 } // namespace llvm
66 
67 // Algorithm-specific params for Ext-TSP. The values are tuned for the best
68 // performance of large-scale front-end bound binaries.
69 static cl::opt<double> ForwardWeightCond(
70     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
71     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
72 
73 static cl::opt<double> ForwardWeightUncond(
74     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
75     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
76 
77 static cl::opt<double> BackwardWeightCond(
78     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
79     cl::desc("The weight of conditional backward jumps for ExtTSP value"));
80 
81 static cl::opt<double> BackwardWeightUncond(
82     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
83     cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
84 
85 static cl::opt<double> FallthroughWeightCond(
86     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
87     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
88 
89 static cl::opt<double> FallthroughWeightUncond(
90     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
91     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
92 
93 static cl::opt<unsigned> ForwardDistance(
94     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
95     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
96 
97 static cl::opt<unsigned> BackwardDistance(
98     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
99     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
100 
101 // The maximum size of a chain created by the algorithm. The size is bounded
102 // so that the algorithm can efficiently process extremely large instances.
103 static cl::opt<unsigned>
104     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
105                  cl::desc("The maximum size of a chain to create."));
106 
107 // The maximum size of a chain for splitting. Larger values of the threshold
108 // may yield better quality at the cost of worsen run-time.
109 static cl::opt<unsigned> ChainSplitThreshold(
110     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
111     cl::desc("The maximum size of a chain to apply splitting"));
112 
113 // The option enables splitting (large) chains along in-coming and out-going
114 // jumps. This typically results in a better quality.
115 static cl::opt<bool> EnableChainSplitAlongJumps(
116     "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
117     cl::desc("The maximum size of a chain to apply splitting"));
118 
119 // Algorithm-specific options for CDS.
120 static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
121                                       cl::desc("The size of the cache"));
122 
123 static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden,
124                                    cl::desc("The size of a line in the cache"));
125 
126 static cl::opt<unsigned>
127     CDMaxChainSize("cdsort-max-chain-size", cl::ReallyHidden,
128                    cl::desc("The maximum size of a chain to create"));
129 
130 static cl::opt<double> DistancePower(
131     "cds-distance-power", cl::ReallyHidden,
132     cl::desc("The power exponent for the distance-based locality"));
133 
134 static cl::opt<double> FrequencyScale(
135     "cds-frequency-scale", cl::ReallyHidden,
136     cl::desc("The scale factor for the frequency-based locality"));
137 
138 namespace {
139 
140 // Epsilon for comparison of doubles.
141 constexpr double EPS = 1e-8;
142 
143 // Compute the Ext-TSP score for a given jump.
144 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
145                        double Weight) {
146   if (JumpDist > JumpMaxDist)
147     return 0;
148   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
149   return Weight * Prob * Count;
150 }
151 
152 // Compute the Ext-TSP score for a jump between a given pair of blocks,
153 // using their sizes, (estimated) addresses and the jump execution count.
154 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
155                    uint64_t Count, bool IsConditional) {
156   // Fallthrough
157   if (SrcAddr + SrcSize == DstAddr) {
158     return jumpExtTSPScore(0, 1, Count,
159                            IsConditional ? FallthroughWeightCond
160                                          : FallthroughWeightUncond);
161   }
162   // Forward
163   if (SrcAddr + SrcSize < DstAddr) {
164     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
165     return jumpExtTSPScore(Dist, ForwardDistance, Count,
166                            IsConditional ? ForwardWeightCond
167                                          : ForwardWeightUncond);
168   }
169   // Backward
170   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
171   return jumpExtTSPScore(Dist, BackwardDistance, Count,
172                          IsConditional ? BackwardWeightCond
173                                        : BackwardWeightUncond);
174 }
175 
176 /// A type of merging two chains, X and Y. The former chain is split into
177 /// X1 and X2 and then concatenated with Y in the order specified by the type.
178 enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
179 
180 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
181 /// together with the corresponding merge 'type' and 'offset'.
182 struct MergeGainT {
183   explicit MergeGainT() = default;
184   explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType)
185       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
186 
187   double score() const { return Score; }
188 
189   size_t mergeOffset() const { return MergeOffset; }
190 
191   MergeTypeT mergeType() const { return MergeType; }
192 
193   void setMergeType(MergeTypeT Ty) { MergeType = Ty; }
194 
195   // Returns 'true' iff Other is preferred over this.
196   bool operator<(const MergeGainT &Other) const {
197     return (Other.Score > EPS && Other.Score > Score + EPS);
198   }
199 
200   // Update the current gain if Other is preferred over this.
201   void updateIfLessThan(const MergeGainT &Other) {
202     if (*this < Other)
203       *this = Other;
204   }
205 
206 private:
207   double Score{-1.0};
208   size_t MergeOffset{0};
209   MergeTypeT MergeType{MergeTypeT::X_Y};
210 };
211 
212 struct JumpT;
213 struct ChainT;
214 struct ChainEdge;
215 
216 /// A node in the graph, typically corresponding to a basic block in the CFG or
217 /// a function in the call graph.
218 struct NodeT {
219   NodeT(const NodeT &) = delete;
220   NodeT(NodeT &&) = default;
221   NodeT &operator=(const NodeT &) = delete;
222   NodeT &operator=(NodeT &&) = default;
223 
224   explicit NodeT(size_t Index, uint64_t Size, uint64_t Count)
225       : Index(Index), Size(Size), ExecutionCount(Count) {}
226 
227   bool isEntry() const { return Index == 0; }
228 
229   // The total execution count of outgoing jumps.
230   uint64_t outCount() const;
231 
232   // The total execution count of incoming jumps.
233   uint64_t inCount() const;
234 
235   // The original index of the node in graph.
236   size_t Index{0};
237   // The index of the node in the current chain.
238   size_t CurIndex{0};
239   // The size of the node in the binary.
240   uint64_t Size{0};
241   // The execution count of the node in the profile data.
242   uint64_t ExecutionCount{0};
243   // The current chain of the node.
244   ChainT *CurChain{nullptr};
245   // The offset of the node in the current chain.
246   mutable uint64_t EstimatedAddr{0};
247   // Forced successor of the node in the graph.
248   NodeT *ForcedSucc{nullptr};
249   // Forced predecessor of the node in the graph.
250   NodeT *ForcedPred{nullptr};
251   // Outgoing jumps from the node.
252   std::vector<JumpT *> OutJumps;
253   // Incoming jumps to the node.
254   std::vector<JumpT *> InJumps;
255 };
256 
257 /// An arc in the graph, typically corresponding to a jump between two nodes.
258 struct JumpT {
259   JumpT(const JumpT &) = delete;
260   JumpT(JumpT &&) = default;
261   JumpT &operator=(const JumpT &) = delete;
262   JumpT &operator=(JumpT &&) = default;
263 
264   explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount)
265       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
266 
267   // Source node of the jump.
268   NodeT *Source;
269   // Target node of the jump.
270   NodeT *Target;
271   // Execution count of the arc in the profile data.
272   uint64_t ExecutionCount{0};
273   // Whether the jump corresponds to a conditional branch.
274   bool IsConditional{false};
275   // The offset of the jump from the source node.
276   uint64_t Offset{0};
277 };
278 
279 /// A chain (ordered sequence) of nodes in the graph.
280 struct ChainT {
281   ChainT(const ChainT &) = delete;
282   ChainT(ChainT &&) = default;
283   ChainT &operator=(const ChainT &) = delete;
284   ChainT &operator=(ChainT &&) = default;
285 
286   explicit ChainT(uint64_t Id, NodeT *Node)
287       : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size),
288         Nodes(1, Node) {}
289 
290   size_t numBlocks() const { return Nodes.size(); }
291 
292   double density() const { return static_cast<double>(ExecutionCount) / Size; }
293 
294   bool isEntry() const { return Nodes[0]->Index == 0; }
295 
296   bool isCold() const {
297     for (NodeT *Node : Nodes) {
298       if (Node->ExecutionCount > 0)
299         return false;
300     }
301     return true;
302   }
303 
304   ChainEdge *getEdge(ChainT *Other) const {
305     for (const auto &[Chain, ChainEdge] : Edges) {
306       if (Chain == Other)
307         return ChainEdge;
308     }
309     return nullptr;
310   }
311 
312   void removeEdge(ChainT *Other) {
313     auto It = Edges.begin();
314     while (It != Edges.end()) {
315       if (It->first == Other) {
316         Edges.erase(It);
317         return;
318       }
319       It++;
320     }
321   }
322 
323   void addEdge(ChainT *Other, ChainEdge *Edge) {
324     Edges.push_back(std::make_pair(Other, Edge));
325   }
326 
327   void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) {
328     Nodes = std::move(MergedBlocks);
329     // Update the chain's data.
330     ExecutionCount += Other->ExecutionCount;
331     Size += Other->Size;
332     Id = Nodes[0]->Index;
333     // Update the node's data.
334     for (size_t Idx = 0; Idx < Nodes.size(); Idx++) {
335       Nodes[Idx]->CurChain = this;
336       Nodes[Idx]->CurIndex = Idx;
337     }
338   }
339 
340   void mergeEdges(ChainT *Other);
341 
342   void clear() {
343     Nodes.clear();
344     Nodes.shrink_to_fit();
345     Edges.clear();
346     Edges.shrink_to_fit();
347   }
348 
349   // Unique chain identifier.
350   uint64_t Id;
351   // Cached ext-tsp score for the chain.
352   double Score{0};
353   // The total execution count of the chain.
354   uint64_t ExecutionCount{0};
355   // The total size of the chain.
356   uint64_t Size{0};
357   // Nodes of the chain.
358   std::vector<NodeT *> Nodes;
359   // Adjacent chains and corresponding edges (lists of jumps).
360   std::vector<std::pair<ChainT *, ChainEdge *>> Edges;
361 };
362 
363 /// An edge in the graph representing jumps between two chains.
364 /// When nodes are merged into chains, the edges are combined too so that
365 /// there is always at most one edge between a pair of chains.
366 struct ChainEdge {
367   ChainEdge(const ChainEdge &) = delete;
368   ChainEdge(ChainEdge &&) = default;
369   ChainEdge &operator=(const ChainEdge &) = delete;
370   ChainEdge &operator=(ChainEdge &&) = delete;
371 
372   explicit ChainEdge(JumpT *Jump)
373       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
374         Jumps(1, Jump) {}
375 
376   ChainT *srcChain() const { return SrcChain; }
377 
378   ChainT *dstChain() const { return DstChain; }
379 
380   bool isSelfEdge() const { return SrcChain == DstChain; }
381 
382   const std::vector<JumpT *> &jumps() const { return Jumps; }
383 
384   void appendJump(JumpT *Jump) { Jumps.push_back(Jump); }
385 
386   void moveJumps(ChainEdge *Other) {
387     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
388     Other->Jumps.clear();
389     Other->Jumps.shrink_to_fit();
390   }
391 
392   void changeEndpoint(ChainT *From, ChainT *To) {
393     if (From == SrcChain)
394       SrcChain = To;
395     if (From == DstChain)
396       DstChain = To;
397   }
398 
399   bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const {
400     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
401   }
402 
403   MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const {
404     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
405   }
406 
407   void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) {
408     if (Src == SrcChain) {
409       CachedGainForward = MergeGain;
410       CacheValidForward = true;
411     } else {
412       CachedGainBackward = MergeGain;
413       CacheValidBackward = true;
414     }
415   }
416 
417   void invalidateCache() {
418     CacheValidForward = false;
419     CacheValidBackward = false;
420   }
421 
422   void setMergeGain(MergeGainT Gain) { CachedGain = Gain; }
423 
424   MergeGainT getMergeGain() const { return CachedGain; }
425 
426   double gain() const { return CachedGain.score(); }
427 
428 private:
429   // Source chain.
430   ChainT *SrcChain{nullptr};
431   // Destination chain.
432   ChainT *DstChain{nullptr};
433   // Original jumps in the binary with corresponding execution counts.
434   std::vector<JumpT *> Jumps;
435   // Cached gain value for merging the pair of chains.
436   MergeGainT CachedGain;
437 
438   // Cached gain values for merging the pair of chains. Since the gain of
439   // merging (Src, Dst) and (Dst, Src) might be different, we store both values
440   // here and a flag indicating which of the options results in a higher gain.
441   // Cached gain values.
442   MergeGainT CachedGainForward;
443   MergeGainT CachedGainBackward;
444   // Whether the cached value must be recomputed.
445   bool CacheValidForward{false};
446   bool CacheValidBackward{false};
447 };
448 
449 uint64_t NodeT::outCount() const {
450   uint64_t Count = 0;
451   for (JumpT *Jump : OutJumps)
452     Count += Jump->ExecutionCount;
453   return Count;
454 }
455 
456 uint64_t NodeT::inCount() const {
457   uint64_t Count = 0;
458   for (JumpT *Jump : InJumps)
459     Count += Jump->ExecutionCount;
460   return Count;
461 }
462 
463 void ChainT::mergeEdges(ChainT *Other) {
464   // Update edges adjacent to chain Other.
465   for (const auto &[DstChain, DstEdge] : Other->Edges) {
466     ChainT *TargetChain = DstChain == Other ? this : DstChain;
467     ChainEdge *CurEdge = getEdge(TargetChain);
468     if (CurEdge == nullptr) {
469       DstEdge->changeEndpoint(Other, this);
470       this->addEdge(TargetChain, DstEdge);
471       if (DstChain != this && DstChain != Other)
472         DstChain->addEdge(this, DstEdge);
473     } else {
474       CurEdge->moveJumps(DstEdge);
475     }
476     // Cleanup leftover edge.
477     if (DstChain != Other)
478       DstChain->removeEdge(Other);
479   }
480 }
481 
482 using NodeIter = std::vector<NodeT *>::const_iterator;
483 static std::vector<NodeT *> EmptyList;
484 
485 /// A wrapper around three concatenated vectors (chains) of nodes; it is used
486 /// to avoid extra instantiation of the vectors.
487 struct MergedNodesT {
488   MergedNodesT(NodeIter Begin1, NodeIter End1,
489                NodeIter Begin2 = EmptyList.begin(),
490                NodeIter End2 = EmptyList.end(),
491                NodeIter Begin3 = EmptyList.begin(),
492                NodeIter End3 = EmptyList.end())
493       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
494         End3(End3) {}
495 
496   template <typename F> void forEach(const F &Func) const {
497     for (auto It = Begin1; It != End1; It++)
498       Func(*It);
499     for (auto It = Begin2; It != End2; It++)
500       Func(*It);
501     for (auto It = Begin3; It != End3; It++)
502       Func(*It);
503   }
504 
505   std::vector<NodeT *> getNodes() const {
506     std::vector<NodeT *> Result;
507     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
508                    std::distance(Begin3, End3));
509     Result.insert(Result.end(), Begin1, End1);
510     Result.insert(Result.end(), Begin2, End2);
511     Result.insert(Result.end(), Begin3, End3);
512     return Result;
513   }
514 
515   const NodeT *getFirstNode() const { return *Begin1; }
516 
517   bool empty() const { return Begin1 == End1; }
518 
519 private:
520   NodeIter Begin1;
521   NodeIter End1;
522   NodeIter Begin2;
523   NodeIter End2;
524   NodeIter Begin3;
525   NodeIter End3;
526 };
527 
528 /// A wrapper around two concatenated vectors (chains) of jumps.
529 struct MergedJumpsT {
530   MergedJumpsT(const std::vector<JumpT *> *Jumps1,
531                const std::vector<JumpT *> *Jumps2 = nullptr) {
532     assert(!Jumps1->empty() && "cannot merge empty jump list");
533     JumpArray[0] = Jumps1;
534     JumpArray[1] = Jumps2;
535   }
536 
537   template <typename F> void forEach(const F &Func) const {
538     for (auto Jumps : JumpArray)
539       if (Jumps != nullptr)
540         for (JumpT *Jump : *Jumps)
541           Func(Jump);
542   }
543 
544 private:
545   std::array<const std::vector<JumpT *> *, 2> JumpArray{nullptr, nullptr};
546 };
547 
548 /// Merge two chains of nodes respecting a given 'type' and 'offset'.
549 ///
550 /// If MergeType == 0, then the result is a concatenation of two chains.
551 /// Otherwise, the first chain is cut into two sub-chains at the offset,
552 /// and merged using all possible ways of concatenating three chains.
553 MergedNodesT mergeNodes(const std::vector<NodeT *> &X,
554                         const std::vector<NodeT *> &Y, size_t MergeOffset,
555                         MergeTypeT MergeType) {
556   // Split the first chain, X, into X1 and X2.
557   NodeIter BeginX1 = X.begin();
558   NodeIter EndX1 = X.begin() + MergeOffset;
559   NodeIter BeginX2 = X.begin() + MergeOffset;
560   NodeIter EndX2 = X.end();
561   NodeIter BeginY = Y.begin();
562   NodeIter EndY = Y.end();
563 
564   // Construct a new chain from the three existing ones.
565   switch (MergeType) {
566   case MergeTypeT::X_Y:
567     return MergedNodesT(BeginX1, EndX2, BeginY, EndY);
568   case MergeTypeT::Y_X:
569     return MergedNodesT(BeginY, EndY, BeginX1, EndX2);
570   case MergeTypeT::X1_Y_X2:
571     return MergedNodesT(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
572   case MergeTypeT::Y_X2_X1:
573     return MergedNodesT(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
574   case MergeTypeT::X2_X1_Y:
575     return MergedNodesT(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
576   }
577   llvm_unreachable("unexpected chain merge type");
578 }
579 
580 /// The implementation of the ExtTSP algorithm.
581 class ExtTSPImpl {
582 public:
583   ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts,
584              ArrayRef<EdgeCount> EdgeCounts)
585       : NumNodes(NodeSizes.size()) {
586     initialize(NodeSizes, NodeCounts, EdgeCounts);
587   }
588 
589   /// Run the algorithm and return an optimized ordering of nodes.
590   std::vector<uint64_t> run() {
591     // Pass 1: Merge nodes with their mutually forced successors
592     mergeForcedPairs();
593 
594     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
595     mergeChainPairs();
596 
597     // Pass 3: Merge cold nodes to reduce code size
598     mergeColdChains();
599 
600     // Collect nodes from all chains
601     return concatChains();
602   }
603 
604 private:
605   /// Initialize the algorithm's data structures.
606   void initialize(const ArrayRef<uint64_t> &NodeSizes,
607                   const ArrayRef<uint64_t> &NodeCounts,
608                   const ArrayRef<EdgeCount> &EdgeCounts) {
609     // Initialize nodes
610     AllNodes.reserve(NumNodes);
611     for (uint64_t Idx = 0; Idx < NumNodes; Idx++) {
612       uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL);
613       uint64_t ExecutionCount = NodeCounts[Idx];
614       // The execution count of the entry node is set to at least one.
615       if (Idx == 0 && ExecutionCount == 0)
616         ExecutionCount = 1;
617       AllNodes.emplace_back(Idx, Size, ExecutionCount);
618     }
619 
620     // Initialize jumps between nodes
621     SuccNodes.resize(NumNodes);
622     PredNodes.resize(NumNodes);
623     std::vector<uint64_t> OutDegree(NumNodes, 0);
624     AllJumps.reserve(EdgeCounts.size());
625     for (auto Edge : EdgeCounts) {
626       ++OutDegree[Edge.src];
627       // Ignore self-edges.
628       if (Edge.src == Edge.dst)
629         continue;
630 
631       SuccNodes[Edge.src].push_back(Edge.dst);
632       PredNodes[Edge.dst].push_back(Edge.src);
633       if (Edge.count > 0) {
634         NodeT &PredNode = AllNodes[Edge.src];
635         NodeT &SuccNode = AllNodes[Edge.dst];
636         AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count);
637         SuccNode.InJumps.push_back(&AllJumps.back());
638         PredNode.OutJumps.push_back(&AllJumps.back());
639       }
640     }
641     for (JumpT &Jump : AllJumps) {
642       assert(OutDegree[Jump.Source->Index] > 0);
643       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
644     }
645 
646     // Initialize chains.
647     AllChains.reserve(NumNodes);
648     HotChains.reserve(NumNodes);
649     for (NodeT &Node : AllNodes) {
650       // Create a chain.
651       AllChains.emplace_back(Node.Index, &Node);
652       Node.CurChain = &AllChains.back();
653       if (Node.ExecutionCount > 0)
654         HotChains.push_back(&AllChains.back());
655     }
656 
657     // Initialize chain edges.
658     AllEdges.reserve(AllJumps.size());
659     for (NodeT &PredNode : AllNodes) {
660       for (JumpT *Jump : PredNode.OutJumps) {
661         NodeT *SuccNode = Jump->Target;
662         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
663         // This edge is already present in the graph.
664         if (CurEdge != nullptr) {
665           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
666           CurEdge->appendJump(Jump);
667           continue;
668         }
669         // This is a new edge.
670         AllEdges.emplace_back(Jump);
671         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
672         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
673       }
674     }
675   }
676 
677   /// For a pair of nodes, A and B, node B is the forced successor of A,
678   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
679   /// to B are from A. Such nodes should be adjacent in the optimal ordering;
680   /// the method finds and merges such pairs of nodes.
681   void mergeForcedPairs() {
682     // Find forced pairs of blocks.
683     for (NodeT &Node : AllNodes) {
684       if (SuccNodes[Node.Index].size() == 1 &&
685           PredNodes[SuccNodes[Node.Index][0]].size() == 1 &&
686           SuccNodes[Node.Index][0] != 0) {
687         size_t SuccIndex = SuccNodes[Node.Index][0];
688         Node.ForcedSucc = &AllNodes[SuccIndex];
689         AllNodes[SuccIndex].ForcedPred = &Node;
690       }
691     }
692 
693     // There might be 'cycles' in the forced dependencies, since profile
694     // data isn't 100% accurate. Typically this is observed in loops, when the
695     // loop edges are the hottest successors for the basic blocks of the loop.
696     // Break the cycles by choosing the node with the smallest index as the
697     // head. This helps to keep the original order of the loops, which likely
698     // have already been rotated in the optimized manner.
699     for (NodeT &Node : AllNodes) {
700       if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr)
701         continue;
702 
703       NodeT *SuccNode = Node.ForcedSucc;
704       while (SuccNode != nullptr && SuccNode != &Node) {
705         SuccNode = SuccNode->ForcedSucc;
706       }
707       if (SuccNode == nullptr)
708         continue;
709       // Break the cycle.
710       AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr;
711       Node.ForcedPred = nullptr;
712     }
713 
714     // Merge nodes with their fallthrough successors.
715     for (NodeT &Node : AllNodes) {
716       if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) {
717         const NodeT *CurBlock = &Node;
718         while (CurBlock->ForcedSucc != nullptr) {
719           const NodeT *NextBlock = CurBlock->ForcedSucc;
720           mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y);
721           CurBlock = NextBlock;
722         }
723       }
724     }
725   }
726 
727   /// Merge pairs of chains while improving the ExtTSP objective.
728   void mergeChainPairs() {
729     /// Deterministically compare pairs of chains.
730     auto compareChainPairs = [](const ChainT *A1, const ChainT *B1,
731                                 const ChainT *A2, const ChainT *B2) {
732       return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id);
733     };
734 
735     while (HotChains.size() > 1) {
736       ChainT *BestChainPred = nullptr;
737       ChainT *BestChainSucc = nullptr;
738       MergeGainT BestGain;
739       // Iterate over all pairs of chains.
740       for (ChainT *ChainPred : HotChains) {
741         // Get candidates for merging with the current chain.
742         for (const auto &[ChainSucc, Edge] : ChainPred->Edges) {
743           // Ignore loop edges.
744           if (ChainPred == ChainSucc)
745             continue;
746 
747           // Stop early if the combined chain violates the maximum allowed size.
748           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
749             continue;
750 
751           // Compute the gain of merging the two chains.
752           MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
753           if (CurGain.score() <= EPS)
754             continue;
755 
756           if (BestGain < CurGain ||
757               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
758                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
759                                  BestChainSucc))) {
760             BestGain = CurGain;
761             BestChainPred = ChainPred;
762             BestChainSucc = ChainSucc;
763           }
764         }
765       }
766 
767       // Stop merging when there is no improvement.
768       if (BestGain.score() <= EPS)
769         break;
770 
771       // Merge the best pair of chains.
772       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
773                   BestGain.mergeType());
774     }
775   }
776 
777   /// Merge remaining nodes into chains w/o taking jump counts into
778   /// consideration. This allows to maintain the original node order in the
779   /// absence of profile data.
780   void mergeColdChains() {
781     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
782       // Iterating in reverse order to make sure original fallthrough jumps are
783       // merged first; this might be beneficial for code size.
784       size_t NumSuccs = SuccNodes[SrcBB].size();
785       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
786         size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
787         ChainT *SrcChain = AllNodes[SrcBB].CurChain;
788         ChainT *DstChain = AllNodes[DstBB].CurChain;
789         if (SrcChain != DstChain && !DstChain->isEntry() &&
790             SrcChain->Nodes.back()->Index == SrcBB &&
791             DstChain->Nodes.front()->Index == DstBB &&
792             SrcChain->isCold() == DstChain->isCold()) {
793           mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
794         }
795       }
796     }
797   }
798 
799   /// Compute the Ext-TSP score for a given node order and a list of jumps.
800   double extTSPScore(const MergedNodesT &Nodes,
801                      const MergedJumpsT &Jumps) const {
802     uint64_t CurAddr = 0;
803     Nodes.forEach([&](const NodeT *Node) {
804       Node->EstimatedAddr = CurAddr;
805       CurAddr += Node->Size;
806     });
807 
808     double Score = 0;
809     Jumps.forEach([&](const JumpT *Jump) {
810       const NodeT *SrcBlock = Jump->Source;
811       const NodeT *DstBlock = Jump->Target;
812       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
813                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
814                              Jump->IsConditional);
815     });
816     return Score;
817   }
818 
819   /// Compute the gain of merging two chains.
820   ///
821   /// The function considers all possible ways of merging two chains and
822   /// computes the one having the largest increase in ExtTSP objective. The
823   /// result is a pair with the first element being the gain and the second
824   /// element being the corresponding merging type.
825   MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
826                               ChainEdge *Edge) const {
827     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc))
828       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
829 
830     assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
831     // Precompute jumps between ChainPred and ChainSucc.
832     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
833     MergedJumpsT Jumps(&Edge->jumps(), EdgePP ? &EdgePP->jumps() : nullptr);
834 
835     // This object holds the best chosen gain of merging two chains.
836     MergeGainT Gain = MergeGainT();
837 
838     /// Given a merge offset and a list of merge types, try to merge two chains
839     /// and update Gain with a better alternative.
840     auto tryChainMerging = [&](size_t Offset,
841                                const std::vector<MergeTypeT> &MergeTypes) {
842       // Skip merging corresponding to concatenation w/o splitting.
843       if (Offset == 0 || Offset == ChainPred->Nodes.size())
844         return;
845       // Skip merging if it breaks Forced successors.
846       NodeT *Node = ChainPred->Nodes[Offset - 1];
847       if (Node->ForcedSucc != nullptr)
848         return;
849       // Apply the merge, compute the corresponding gain, and update the best
850       // value, if the merge is beneficial.
851       for (const MergeTypeT &MergeType : MergeTypes) {
852         Gain.updateIfLessThan(
853             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
854       }
855     };
856 
857     // Try to concatenate two chains w/o splitting.
858     Gain.updateIfLessThan(
859         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));
860 
861     if (EnableChainSplitAlongJumps) {
862       // Attach (a part of) ChainPred before the first node of ChainSucc.
863       for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
864         const NodeT *SrcBlock = Jump->Source;
865         if (SrcBlock->CurChain != ChainPred)
866           continue;
867         size_t Offset = SrcBlock->CurIndex + 1;
868         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
869       }
870 
871       // Attach (a part of) ChainPred after the last node of ChainSucc.
872       for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
873         const NodeT *DstBlock = Jump->Target;
874         if (DstBlock->CurChain != ChainPred)
875           continue;
876         size_t Offset = DstBlock->CurIndex;
877         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
878       }
879     }
880 
881     // Try to break ChainPred in various ways and concatenate with ChainSucc.
882     if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
883       for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
884         // Try to split the chain in different ways. In practice, applying
885         // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
886         // it from consideration to reduce the search space.
887         tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
888                                  MergeTypeT::X2_X1_Y});
889       }
890     }
891     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
892     return Gain;
893   }
894 
895   /// Compute the score gain of merging two chains, respecting a given
896   /// merge 'type' and 'offset'.
897   ///
898   /// The two chains are not modified in the method.
899   MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
900                               const MergedJumpsT &Jumps, size_t MergeOffset,
901                               MergeTypeT MergeType) const {
902     MergedNodesT MergedNodes =
903         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
904 
905     // Do not allow a merge that does not preserve the original entry point.
906     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
907         !MergedNodes.getFirstNode()->isEntry())
908       return MergeGainT();
909 
910     // The gain for the new chain.
911     double NewScore = extTSPScore(MergedNodes, Jumps);
912     double CurScore = ChainPred->Score;
913     return MergeGainT(NewScore - CurScore, MergeOffset, MergeType);
914   }
915 
916   /// Merge chain From into chain Into, update the list of active chains,
917   /// adjacency information, and the corresponding cached values.
918   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
919                    MergeTypeT MergeType) {
920     assert(Into != From && "a chain cannot be merged with itself");
921 
922     // Merge the nodes.
923     MergedNodesT MergedNodes =
924         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
925     Into->merge(From, MergedNodes.getNodes());
926 
927     // Merge the edges.
928     Into->mergeEdges(From);
929     From->clear();
930 
931     // Update cached ext-tsp score for the new chain.
932     ChainEdge *SelfEdge = Into->getEdge(Into);
933     if (SelfEdge != nullptr) {
934       MergedNodes = MergedNodesT(Into->Nodes.begin(), Into->Nodes.end());
935       MergedJumpsT MergedJumps(&SelfEdge->jumps());
936       Into->Score = extTSPScore(MergedNodes, MergedJumps);
937     }
938 
939     // Remove the chain from the list of active chains.
940     llvm::erase_value(HotChains, From);
941 
942     // Invalidate caches.
943     for (auto EdgeIt : Into->Edges)
944       EdgeIt.second->invalidateCache();
945   }
946 
947   /// Concatenate all chains into the final order.
948   std::vector<uint64_t> concatChains() {
949     // Collect chains and calculate density stats for their sorting.
950     std::vector<const ChainT *> SortedChains;
951     DenseMap<const ChainT *, double> ChainDensity;
952     for (ChainT &Chain : AllChains) {
953       if (!Chain.Nodes.empty()) {
954         SortedChains.push_back(&Chain);
955         // Using doubles to avoid overflow of ExecutionCounts.
956         double Size = 0;
957         double ExecutionCount = 0;
958         for (NodeT *Node : Chain.Nodes) {
959           Size += static_cast<double>(Node->Size);
960           ExecutionCount += static_cast<double>(Node->ExecutionCount);
961         }
962         assert(Size > 0 && "a chain of zero size");
963         ChainDensity[&Chain] = ExecutionCount / Size;
964       }
965     }
966 
967     // Sorting chains by density in the decreasing order.
968     std::sort(SortedChains.begin(), SortedChains.end(),
969               [&](const ChainT *L, const ChainT *R) {
970                 // Place the entry point at the beginning of the order.
971                 if (L->isEntry() != R->isEntry())
972                   return L->isEntry();
973 
974                 const double DL = ChainDensity[L];
975                 const double DR = ChainDensity[R];
976                 // Compare by density and break ties by chain identifiers.
977                 return std::make_tuple(-DL, L->Id) <
978                        std::make_tuple(-DR, R->Id);
979               });
980 
981     // Collect the nodes in the order specified by their chains.
982     std::vector<uint64_t> Order;
983     Order.reserve(NumNodes);
984     for (const ChainT *Chain : SortedChains)
985       for (NodeT *Node : Chain->Nodes)
986         Order.push_back(Node->Index);
987     return Order;
988   }
989 
990 private:
991   /// The number of nodes in the graph.
992   const size_t NumNodes;
993 
994   /// Successors of each node.
995   std::vector<std::vector<uint64_t>> SuccNodes;
996 
997   /// Predecessors of each node.
998   std::vector<std::vector<uint64_t>> PredNodes;
999 
1000   /// All nodes (basic blocks) in the graph.
1001   std::vector<NodeT> AllNodes;
1002 
1003   /// All jumps between the nodes.
1004   std::vector<JumpT> AllJumps;
1005 
1006   /// All chains of nodes.
1007   std::vector<ChainT> AllChains;
1008 
1009   /// All edges between the chains.
1010   std::vector<ChainEdge> AllEdges;
1011 
1012   /// Active chains. The vector gets updated at runtime when chains are merged.
1013   std::vector<ChainT *> HotChains;
1014 };
1015 
1016 /// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
1017 /// functions represented by a call graph.
1018 class CDSortImpl {
1019 public:
1020   CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
1021              ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
1022              ArrayRef<uint64_t> EdgeOffsets)
1023       : Config(Config), NumNodes(NodeSizes.size()) {
1024     initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
1025   }
1026 
1027   /// Run the algorithm and return an ordered set of function clusters.
1028   std::vector<uint64_t> run() {
1029     // Merge pairs of chains while improving the objective.
1030     mergeChainPairs();
1031 
1032     // Collect nodes from all the chains.
1033     return concatChains();
1034   }
1035 
1036 private:
1037   /// Initialize the algorithm's data structures.
1038   void initialize(const ArrayRef<uint64_t> &NodeSizes,
1039                   const ArrayRef<uint64_t> &NodeCounts,
1040                   const ArrayRef<EdgeCount> &EdgeCounts,
1041                   const ArrayRef<uint64_t> &EdgeOffsets) {
1042     // Initialize nodes.
1043     AllNodes.reserve(NumNodes);
1044     for (uint64_t Node = 0; Node < NumNodes; Node++) {
1045       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
1046       uint64_t ExecutionCount = NodeCounts[Node];
1047       AllNodes.emplace_back(Node, Size, ExecutionCount);
1048       TotalSamples += ExecutionCount;
1049       if (ExecutionCount > 0)
1050         TotalSize += Size;
1051     }
1052 
1053     // Initialize jumps between the nodes.
1054     SuccNodes.resize(NumNodes);
1055     PredNodes.resize(NumNodes);
1056     AllJumps.reserve(EdgeCounts.size());
1057     for (size_t I = 0; I < EdgeCounts.size(); I++) {
1058       auto [Pred, Succ, Count] = EdgeCounts[I];
1059       // Ignore recursive calls.
1060       if (Pred == Succ)
1061         continue;
1062 
1063       SuccNodes[Pred].push_back(Succ);
1064       PredNodes[Succ].push_back(Pred);
1065       if (Count > 0) {
1066         NodeT &PredNode = AllNodes[Pred];
1067         NodeT &SuccNode = AllNodes[Succ];
1068         AllJumps.emplace_back(&PredNode, &SuccNode, Count);
1069         AllJumps.back().Offset = EdgeOffsets[I];
1070         SuccNode.InJumps.push_back(&AllJumps.back());
1071         PredNode.OutJumps.push_back(&AllJumps.back());
1072       }
1073     }
1074 
1075     // Initialize chains.
1076     AllChains.reserve(NumNodes);
1077     for (NodeT &Node : AllNodes) {
1078       // Adjust execution counts.
1079       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
1080       Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
1081       // Create chain.
1082       AllChains.emplace_back(Node.Index, &Node);
1083       Node.CurChain = &AllChains.back();
1084     }
1085 
1086     // Initialize chain edges.
1087     AllEdges.reserve(AllJumps.size());
1088     for (NodeT &PredNode : AllNodes) {
1089       for (JumpT *Jump : PredNode.OutJumps) {
1090         NodeT *SuccNode = Jump->Target;
1091         ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
1092         // this edge is already present in the graph.
1093         if (CurEdge != nullptr) {
1094           assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
1095           CurEdge->appendJump(Jump);
1096           continue;
1097         }
1098         // this is a new edge.
1099         AllEdges.emplace_back(Jump);
1100         PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
1101         SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
1102       }
1103     }
1104   }
1105 
1106   /// Merge pairs of chains while there is an improvement in the objective.
1107   void mergeChainPairs() {
1108     // Create a priority queue containing all edges ordered by the merge gain.
1109     auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
1110       return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
1111              std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
1112     };
1113     std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
1114 
1115     // Insert the edges into the queue.
1116     [[maybe_unused]] size_t NumActiveChains = 0;
1117     for (NodeT &Node : AllNodes) {
1118       if (Node.ExecutionCount == 0)
1119         continue;
1120       ++NumActiveChains;
1121       for (const auto &[_, Edge] : Node.CurChain->Edges) {
1122         // Ignore self-edges.
1123         if (Edge->isSelfEdge())
1124           continue;
1125         // Ignore already processed edges.
1126         if (Edge->gain() != -1.0)
1127           continue;
1128 
1129         // Compute the gain of merging the two chains.
1130         MergeGainT Gain = getBestMergeGain(Edge);
1131         Edge->setMergeGain(Gain);
1132 
1133         if (Edge->gain() > EPS)
1134           Queue.insert(Edge);
1135       }
1136     }
1137 
1138     // Merge the chains while the gain of merging is positive.
1139     while (!Queue.empty()) {
1140       // Extract the best (top) edge for merging.
1141       ChainEdge *BestEdge = *Queue.begin();
1142       Queue.erase(Queue.begin());
1143       ChainT *BestSrcChain = BestEdge->srcChain();
1144       ChainT *BestDstChain = BestEdge->dstChain();
1145 
1146       // Remove outdated edges from the queue.
1147       for (const auto &[_, ChainEdge] : BestSrcChain->Edges)
1148         Queue.erase(ChainEdge);
1149       for (const auto &[_, ChainEdge] : BestDstChain->Edges)
1150         Queue.erase(ChainEdge);
1151 
1152       // Merge the best pair of chains.
1153       MergeGainT BestGain = BestEdge->getMergeGain();
1154       mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
1155                   BestGain.mergeType());
1156       --NumActiveChains;
1157 
1158       // Insert newly created edges into the queue.
1159       for (const auto &[_, Edge] : BestSrcChain->Edges) {
1160         // Ignore loop edges.
1161         if (Edge->isSelfEdge())
1162           continue;
1163         if (Edge->srcChain()->numBlocks() + Edge->dstChain()->numBlocks() >
1164             Config.MaxChainSize)
1165           continue;
1166 
1167         // Compute the gain of merging the two chains.
1168         MergeGainT Gain = getBestMergeGain(Edge);
1169         Edge->setMergeGain(Gain);
1170 
1171         if (Edge->gain() > EPS)
1172           Queue.insert(Edge);
1173       }
1174     }
1175 
1176     LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
1177                       << " of chains from " << NumNodes << " to "
1178                       << NumActiveChains << "\n");
1179   }
1180 
1181   /// Compute the gain of merging two chains.
1182   ///
1183   /// The function considers all possible ways of merging two chains and
1184   /// computes the one having the largest increase in ExtTSP objective. The
1185   /// result is a pair with the first element being the gain and the second
1186   /// element being the corresponding merging type.
1187   MergeGainT getBestMergeGain(ChainEdge *Edge) const {
1188     assert(!Edge->jumps().empty() && "trying to merge chains w/o jumps");
1189     // Precompute jumps between ChainPred and ChainSucc.
1190     MergedJumpsT Jumps(&Edge->jumps());
1191     ChainT *SrcChain = Edge->srcChain();
1192     ChainT *DstChain = Edge->dstChain();
1193 
1194     // This object holds the best currently chosen gain of merging two chains.
1195     MergeGainT Gain = MergeGainT();
1196 
1197     /// Given a list of merge types, try to merge two chains and update Gain
1198     /// with a better alternative.
1199     auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
1200       // Apply the merge, compute the corresponding gain, and update the best
1201       // value, if the merge is beneficial.
1202       for (const MergeTypeT &MergeType : MergeTypes) {
1203         MergeGainT NewGain =
1204             computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
1205 
1206         // When forward and backward gains are the same, prioritize merging that
1207         // preserves the original order of the functions in the binary.
1208         if (std::abs(Gain.score() - NewGain.score()) < EPS) {
1209           if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
1210               (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
1211             Gain = NewGain;
1212           }
1213         } else if (NewGain.score() > Gain.score() + EPS) {
1214           Gain = NewGain;
1215         }
1216       }
1217     };
1218 
1219     // Try to concatenate two chains w/o splitting.
1220     tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
1221 
1222     return Gain;
1223   }
1224 
1225   /// Compute the score gain of merging two chains, respecting a given type.
1226   ///
1227   /// The two chains are not modified in the method.
1228   MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
1229                               const MergedJumpsT &Jumps,
1230                               MergeTypeT MergeType) const {
1231     // This doesn't depend on the ordering of the nodes
1232     double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
1233 
1234     // Merge offset is always 0, as the chains are not split.
1235     size_t MergeOffset = 0;
1236     auto MergedBlocks =
1237         mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
1238     double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
1239 
1240     double GainScore = DistGain + Config.FrequencyScale * FreqGain;
1241     // Scale the result to increase the importance of merging short chains.
1242     if (GainScore >= 0.0)
1243       GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
1244 
1245     return MergeGainT(GainScore, MergeOffset, MergeType);
1246   }
1247 
1248   /// Compute the change of the frequency locality after merging the chains.
1249   double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
1250     auto missProbability = [&](double ChainDensity) {
1251       double PageSamples = ChainDensity * Config.CacheSize;
1252       if (PageSamples >= TotalSamples)
1253         return 0.0;
1254       double P = PageSamples / TotalSamples;
1255       return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
1256     };
1257 
1258     // Cache misses on the chains before merging.
1259     double CurScore =
1260         ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
1261         ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
1262 
1263     // Cache misses on the merged chain
1264     double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
1265     double MergedSize = ChainPred->Size + ChainSucc->Size;
1266     double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
1267     double NewScore = MergedCounts * missProbability(MergedDensity);
1268 
1269     return CurScore - NewScore;
1270   }
1271 
1272   /// Compute the distance locality for a jump / call.
1273   double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
1274     uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
1275     double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
1276     return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
1277   }
1278 
1279   /// Compute the change of the distance locality after merging the chains.
1280   double distBasedLocalityGain(const MergedNodesT &Nodes,
1281                                const MergedJumpsT &Jumps) const {
1282     uint64_t CurAddr = 0;
1283     Nodes.forEach([&](const NodeT *Node) {
1284       Node->EstimatedAddr = CurAddr;
1285       CurAddr += Node->Size;
1286     });
1287 
1288     double CurScore = 0;
1289     double NewScore = 0;
1290     Jumps.forEach([&](const JumpT *Jump) {
1291       uint64_t SrcAddr = Jump->Source->EstimatedAddr + Jump->Offset;
1292       uint64_t DstAddr = Jump->Target->EstimatedAddr;
1293       NewScore += distScore(SrcAddr, DstAddr, Jump->ExecutionCount);
1294       CurScore += distScore(0, TotalSize, Jump->ExecutionCount);
1295     });
1296     return NewScore - CurScore;
1297   }
1298 
1299   /// Merge chain From into chain Into, update the list of active chains,
1300   /// adjacency information, and the corresponding cached values.
1301   void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
1302                    MergeTypeT MergeType) {
1303     assert(Into != From && "a chain cannot be merged with itself");
1304 
1305     // Merge the nodes.
1306     MergedNodesT MergedNodes =
1307         mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
1308     Into->merge(From, MergedNodes.getNodes());
1309 
1310     // Merge the edges.
1311     Into->mergeEdges(From);
1312     From->clear();
1313   }
1314 
1315   /// Concatenate all chains into the final order.
1316   std::vector<uint64_t> concatChains() {
1317     // Collect chains and calculate density stats for their sorting.
1318     std::vector<const ChainT *> SortedChains;
1319     DenseMap<const ChainT *, double> ChainDensity;
1320     for (ChainT &Chain : AllChains) {
1321       if (!Chain.Nodes.empty()) {
1322         SortedChains.push_back(&Chain);
1323         // Using doubles to avoid overflow of ExecutionCounts.
1324         double Size = 0;
1325         double ExecutionCount = 0;
1326         for (NodeT *Node : Chain.Nodes) {
1327           Size += static_cast<double>(Node->Size);
1328           ExecutionCount += static_cast<double>(Node->ExecutionCount);
1329         }
1330         assert(Size > 0 && "a chain of zero size");
1331         ChainDensity[&Chain] = ExecutionCount / Size;
1332       }
1333     }
1334 
1335     // Sort chains by density in the decreasing order.
1336     std::sort(SortedChains.begin(), SortedChains.end(),
1337               [&](const ChainT *L, const ChainT *R) {
1338                 const double DL = ChainDensity[L];
1339                 const double DR = ChainDensity[R];
1340                 // Compare by density and break ties by chain identifiers.
1341                 return std::make_tuple(-DL, L->Id) <
1342                        std::make_tuple(-DR, R->Id);
1343               });
1344 
1345     // Collect the nodes in the order specified by their chains.
1346     std::vector<uint64_t> Order;
1347     Order.reserve(NumNodes);
1348     for (const ChainT *Chain : SortedChains)
1349       for (NodeT *Node : Chain->Nodes)
1350         Order.push_back(Node->Index);
1351     return Order;
1352   }
1353 
1354 private:
1355   /// Config for the algorithm.
1356   const CDSortConfig Config;
1357 
1358   /// The number of nodes in the graph.
1359   const size_t NumNodes;
1360 
1361   /// Successors of each node.
1362   std::vector<std::vector<uint64_t>> SuccNodes;
1363 
1364   /// Predecessors of each node.
1365   std::vector<std::vector<uint64_t>> PredNodes;
1366 
1367   /// All nodes (functions) in the graph.
1368   std::vector<NodeT> AllNodes;
1369 
1370   /// All jumps (function calls) between the nodes.
1371   std::vector<JumpT> AllJumps;
1372 
1373   /// All chains of nodes.
1374   std::vector<ChainT> AllChains;
1375 
1376   /// All edges between the chains.
1377   std::vector<ChainEdge> AllEdges;
1378 
1379   /// The total number of samples in the graph.
1380   uint64_t TotalSamples{0};
1381 
1382   /// The total size of the nodes in the graph.
1383   uint64_t TotalSize{0};
1384 };
1385 
1386 } // end of anonymous namespace
1387 
1388 std::vector<uint64_t>
1389 codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
1390                                 ArrayRef<uint64_t> NodeCounts,
1391                                 ArrayRef<EdgeCount> EdgeCounts) {
1392   // Verify correctness of the input data.
1393   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
1394   assert(NodeSizes.size() > 2 && "Incorrect input");
1395 
1396   // Apply the reordering algorithm.
1397   ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
1398   std::vector<uint64_t> Result = Alg.run();
1399 
1400   // Verify correctness of the output.
1401   assert(Result.front() == 0 && "Original entry point is not preserved");
1402   assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
1403   return Result;
1404 }
1405 
1406 double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
1407                                    ArrayRef<uint64_t> NodeSizes,
1408                                    ArrayRef<uint64_t> NodeCounts,
1409                                    ArrayRef<EdgeCount> EdgeCounts) {
1410   // Estimate addresses of the blocks in memory.
1411   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
1412   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
1413     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
1414   }
1415   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
1416   for (auto Edge : EdgeCounts)
1417     ++OutDegree[Edge.src];
1418 
1419   // Increase the score for each jump.
1420   double Score = 0;
1421   for (auto Edge : EdgeCounts) {
1422     bool IsConditional = OutDegree[Edge.src] > 1;
1423     Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
1424                            Edge.count, IsConditional);
1425   }
1426   return Score;
1427 }
1428 
1429 double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
1430                                    ArrayRef<uint64_t> NodeCounts,
1431                                    ArrayRef<EdgeCount> EdgeCounts) {
1432   std::vector<uint64_t> Order(NodeSizes.size());
1433   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1434     Order[Idx] = Idx;
1435   }
1436   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1437 }
1438 
1439 std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1440     const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
1441     ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
1442     ArrayRef<uint64_t> CallOffsets) {
1443   // Verify correctness of the input data.
1444   assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
1445 
1446   // Apply the reordering algorithm.
1447   CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
1448   std::vector<uint64_t> Result = Alg.run();
1449   assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
1450   return Result;
1451 }
1452 
1453 std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
1454     ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
1455     ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
1456   CDSortConfig Config;
1457   // Populate the config from the command-line options.
1458   if (CacheEntries.getNumOccurrences() > 0)
1459     Config.CacheEntries = CacheEntries;
1460   if (CacheSize.getNumOccurrences() > 0)
1461     Config.CacheSize = CacheSize;
1462   if (CDMaxChainSize.getNumOccurrences() > 0)
1463     Config.MaxChainSize = CDMaxChainSize;
1464   if (DistancePower.getNumOccurrences() > 0)
1465     Config.DistancePower = DistancePower;
1466   if (FrequencyScale.getNumOccurrences() > 0)
1467     Config.FrequencyScale = FrequencyScale;
1468   return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
1469                                     CallOffsets);
1470 }
1471