xref: /llvm-project/llvm/lib/Transforms/Utils/CodeLayout.cpp (revision 1e6921131aa46fae65344f057c6bb40610a43443)
1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // ExtTSP - layout of basic blocks with i-cache optimization.
10 //
11 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
12 // optimizing jump locality and thus processor I-cache utilization. This is
13 // achieved via increasing the number of fall-through jumps and co-locating
14 // frequently executed nodes together. The name follows the underlying
15 // optimization problem, Extended-TSP, which is a generalization of classical
16 // (maximum) Traveling Salesmen Problem.
17 //
18 // The algorithm is a greedy heuristic that works with chains (ordered lists)
19 // of basic blocks. Initially all chains are isolated basic blocks. On every
20 // iteration, we pick a pair of chains whose merging yields the biggest increase
21 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is.
22 // A pair of chains giving the maximum gain is merged into a new chain. The
23 // procedure stops when there is only one chain left, or when merging does not
24 // increase ExtTSP. In the latter case, the remaining chains are sorted by
25 // density in the decreasing order.
26 //
27 // An important aspect is the way two chains are merged. Unlike earlier
28 // algorithms (e.g., based on the approach of Pettis-Hansen), two
29 // chains, X and Y, are first split into three, X1, X2, and Y. Then we
30 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y,
31 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score.
32 // This improves the quality of the final result (the search space is larger)
33 // while keeping the implementation sufficiently fast.
34 //
35 // Reference:
36 //   * A. Newell and S. Pupyrev, Improved Basic Block Reordering,
37 //     IEEE Transactions on Computers, 2020
38 //     https://arxiv.org/abs/1809.04676
39 //
40 //===----------------------------------------------------------------------===//
41 
42 #include "llvm/Transforms/Utils/CodeLayout.h"
43 #include "llvm/Support/CommandLine.h"
44 
45 #include <cmath>
46 
47 using namespace llvm;
48 #define DEBUG_TYPE "code-layout"
49 
50 namespace llvm {
51 cl::opt<bool> EnableExtTspBlockPlacement(
52     "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false),
53     cl::desc("Enable machine block placement based on the ext-tsp model, "
54              "optimizing I-cache utilization."));
55 
56 cl::opt<bool> ApplyExtTspWithoutProfile(
57     "ext-tsp-apply-without-profile",
58     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
59     cl::init(true), cl::Hidden);
60 } // namespace llvm
61 
62 // Algorithm-specific params. The values are tuned for the best performance
63 // of large-scale front-end bound binaries.
64 static cl::opt<double> ForwardWeightCond(
65     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
66     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
67 
68 static cl::opt<double> ForwardWeightUncond(
69     "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
70     cl::desc("The weight of unconditional forward jumps for ExtTSP value"));
71 
72 static cl::opt<double> BackwardWeightCond(
73     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
74     cl::desc("The weight of conditonal backward jumps for ExtTSP value"));
75 
76 static cl::opt<double> BackwardWeightUncond(
77     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
78     cl::desc("The weight of unconditonal backward jumps for ExtTSP value"));
79 
80 static cl::opt<double> FallthroughWeightCond(
81     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
82     cl::desc("The weight of conditional fallthrough jumps for ExtTSP value"));
83 
84 static cl::opt<double> FallthroughWeightUncond(
85     "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05),
86     cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value"));
87 
88 static cl::opt<unsigned> ForwardDistance(
89     "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024),
90     cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP"));
91 
92 static cl::opt<unsigned> BackwardDistance(
93     "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640),
94     cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP"));
95 
96 // The maximum size of a chain created by the algorithm. The size is bounded
97 // so that the algorithm can efficiently process extremely large instance.
98 static cl::opt<unsigned>
99     MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096),
100                  cl::desc("The maximum size of a chain to create."));
101 
102 // The maximum size of a chain for splitting. Larger values of the threshold
103 // may yield better quality at the cost of worsen run-time.
104 static cl::opt<unsigned> ChainSplitThreshold(
105     "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128),
106     cl::desc("The maximum size of a chain to apply splitting"));
107 
108 // The option enables splitting (large) chains along in-coming and out-going
109 // jumps. This typically results in a better quality.
110 static cl::opt<bool> EnableChainSplitAlongJumps(
111     "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
112     cl::desc("The maximum size of a chain to apply splitting"));
113 
114 namespace {
115 
116 // Epsilon for comparison of doubles.
117 constexpr double EPS = 1e-8;
118 
119 // Compute the Ext-TSP score for a given jump.
120 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count,
121                        double Weight) {
122   if (JumpDist > JumpMaxDist)
123     return 0;
124   double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist;
125   return Weight * Prob * Count;
126 }
127 
128 // Compute the Ext-TSP score for a jump between a given pair of blocks,
129 // using their sizes, (estimated) addresses and the jump execution count.
130 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
131                    uint64_t Count, bool IsConditional) {
132   // Fallthrough
133   if (SrcAddr + SrcSize == DstAddr) {
134     return jumpExtTSPScore(0, 1, Count,
135                            IsConditional ? FallthroughWeightCond
136                                          : FallthroughWeightUncond);
137   }
138   // Forward
139   if (SrcAddr + SrcSize < DstAddr) {
140     const uint64_t Dist = DstAddr - (SrcAddr + SrcSize);
141     return jumpExtTSPScore(Dist, ForwardDistance, Count,
142                            IsConditional ? ForwardWeightCond
143                                          : ForwardWeightUncond);
144   }
145   // Backward
146   const uint64_t Dist = SrcAddr + SrcSize - DstAddr;
147   return jumpExtTSPScore(Dist, BackwardDistance, Count,
148                          IsConditional ? BackwardWeightCond
149                                        : BackwardWeightUncond);
150 }
151 
152 /// A type of merging two chains, X and Y. The former chain is split into
153 /// X1 and X2 and then concatenated with Y in the order specified by the type.
154 enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
155 
156 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
157 /// together with the corresponfiding merge 'type' and 'offset'.
158 class MergeGainTy {
159 public:
160   explicit MergeGainTy() = default;
161   explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
162       : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {}
163 
164   double score() const { return Score; }
165 
166   size_t mergeOffset() const { return MergeOffset; }
167 
168   MergeTypeTy mergeType() const { return MergeType; }
169 
170   // Returns 'true' iff Other is preferred over this.
171   bool operator<(const MergeGainTy &Other) const {
172     return (Other.Score > EPS && Other.Score > Score + EPS);
173   }
174 
175   // Update the current gain if Other is preferred over this.
176   void updateIfLessThan(const MergeGainTy &Other) {
177     if (*this < Other)
178       *this = Other;
179   }
180 
181 private:
182   double Score{-1.0};
183   size_t MergeOffset{0};
184   MergeTypeTy MergeType{MergeTypeTy::X_Y};
185 };
186 
187 class Jump;
188 class Chain;
189 class ChainEdge;
190 
191 /// A node in the graph, typically corresponding to a basic block in CFG.
192 class Block {
193 public:
194   Block(const Block &) = delete;
195   Block(Block &&) = default;
196   Block &operator=(const Block &) = delete;
197   Block &operator=(Block &&) = default;
198 
199   // The original index of the block in CFG.
200   size_t Index{0};
201   // The index of the block in the current chain.
202   size_t CurIndex{0};
203   // Size of the block in the binary.
204   uint64_t Size{0};
205   // Execution count of the block in the profile data.
206   uint64_t ExecutionCount{0};
207   // Current chain of the node.
208   Chain *CurChain{nullptr};
209   // An offset of the block in the current chain.
210   mutable uint64_t EstimatedAddr{0};
211   // Forced successor of the block in CFG.
212   Block *ForcedSucc{nullptr};
213   // Forced predecessor of the block in CFG.
214   Block *ForcedPred{nullptr};
215   // Outgoing jumps from the block.
216   std::vector<Jump *> OutJumps;
217   // Incoming jumps to the block.
218   std::vector<Jump *> InJumps;
219 
220 public:
221   explicit Block(size_t Index, uint64_t Size, uint64_t EC)
222       : Index(Index), Size(Size), ExecutionCount(EC) {}
223   bool isEntry() const { return Index == 0; }
224 };
225 
226 /// An arc in the graph, typically corresponding to a jump between two blocks.
227 class Jump {
228 public:
229   Jump(const Jump &) = delete;
230   Jump(Jump &&) = default;
231   Jump &operator=(const Jump &) = delete;
232   Jump &operator=(Jump &&) = default;
233 
234   // Source block of the jump.
235   Block *Source;
236   // Target block of the jump.
237   Block *Target;
238   // Execution count of the arc in the profile data.
239   uint64_t ExecutionCount{0};
240   // Whether the jump corresponds to a conditional branch.
241   bool IsConditional{false};
242 
243 public:
244   explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount)
245       : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {}
246 };
247 
248 /// A chain (ordered sequence) of blocks.
249 class Chain {
250 public:
251   Chain(const Chain &) = delete;
252   Chain(Chain &&) = default;
253   Chain &operator=(const Chain &) = delete;
254   Chain &operator=(Chain &&) = default;
255 
256   explicit Chain(uint64_t Id, Block *Block)
257       : Id(Id), Score(0), Blocks(1, Block) {}
258 
259   uint64_t id() const { return Id; }
260 
261   bool isEntry() const { return Blocks[0]->Index == 0; }
262 
263   bool isCold() const {
264     for (auto *Block : Blocks) {
265       if (Block->ExecutionCount > 0)
266         return false;
267     }
268     return true;
269   }
270 
271   double score() const { return Score; }
272 
273   void setScore(double NewScore) { Score = NewScore; }
274 
275   const std::vector<Block *> &blocks() const { return Blocks; }
276 
277   size_t numBlocks() const { return Blocks.size(); }
278 
279   const std::vector<std::pair<Chain *, ChainEdge *>> &edges() const {
280     return Edges;
281   }
282 
283   ChainEdge *getEdge(Chain *Other) const {
284     for (auto It : Edges) {
285       if (It.first == Other)
286         return It.second;
287     }
288     return nullptr;
289   }
290 
291   void removeEdge(Chain *Other) {
292     auto It = Edges.begin();
293     while (It != Edges.end()) {
294       if (It->first == Other) {
295         Edges.erase(It);
296         return;
297       }
298       It++;
299     }
300   }
301 
302   void addEdge(Chain *Other, ChainEdge *Edge) {
303     Edges.push_back(std::make_pair(Other, Edge));
304   }
305 
306   void merge(Chain *Other, const std::vector<Block *> &MergedBlocks) {
307     Blocks = MergedBlocks;
308     // Update the block's chains
309     for (size_t Idx = 0; Idx < Blocks.size(); Idx++) {
310       Blocks[Idx]->CurChain = this;
311       Blocks[Idx]->CurIndex = Idx;
312     }
313   }
314 
315   void mergeEdges(Chain *Other);
316 
317   void clear() {
318     Blocks.clear();
319     Blocks.shrink_to_fit();
320     Edges.clear();
321     Edges.shrink_to_fit();
322   }
323 
324 private:
325   // Unique chain identifier.
326   uint64_t Id;
327   // Cached ext-tsp score for the chain.
328   double Score;
329   // Blocks of the chain.
330   std::vector<Block *> Blocks;
331   // Adjacent chains and corresponding edges (lists of jumps).
332   std::vector<std::pair<Chain *, ChainEdge *>> Edges;
333 };
334 
335 /// An edge in CFG representing jumps between two chains.
336 /// When blocks are merged into chains, the edges are combined too so that
337 /// there is always at most one edge between a pair of chains
338 class ChainEdge {
339 public:
340   ChainEdge(const ChainEdge &) = delete;
341   ChainEdge(ChainEdge &&) = default;
342   ChainEdge &operator=(const ChainEdge &) = delete;
343   ChainEdge &operator=(ChainEdge &&) = default;
344 
345   explicit ChainEdge(Jump *Jump)
346       : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain),
347         Jumps(1, Jump) {}
348 
349   const std::vector<Jump *> &jumps() const { return Jumps; }
350 
351   void changeEndpoint(Chain *From, Chain *To) {
352     if (From == SrcChain)
353       SrcChain = To;
354     if (From == DstChain)
355       DstChain = To;
356   }
357 
358   void appendJump(Jump *Jump) { Jumps.push_back(Jump); }
359 
360   void moveJumps(ChainEdge *Other) {
361     Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end());
362     Other->Jumps.clear();
363     Other->Jumps.shrink_to_fit();
364   }
365 
366   bool hasCachedMergeGain(Chain *Src, Chain *Dst) const {
367     return Src == SrcChain ? CacheValidForward : CacheValidBackward;
368   }
369 
370   MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const {
371     return Src == SrcChain ? CachedGainForward : CachedGainBackward;
372   }
373 
374   void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) {
375     if (Src == SrcChain) {
376       CachedGainForward = MergeGain;
377       CacheValidForward = true;
378     } else {
379       CachedGainBackward = MergeGain;
380       CacheValidBackward = true;
381     }
382   }
383 
384   void invalidateCache() {
385     CacheValidForward = false;
386     CacheValidBackward = false;
387   }
388 
389 private:
390   // Source chain.
391   Chain *SrcChain{nullptr};
392   // Destination chain.
393   Chain *DstChain{nullptr};
394   // Original jumps in the binary with correspinding execution counts.
395   std::vector<Jump *> Jumps;
396   // Cached ext-tsp value for merging the pair of chains.
397   // Since the gain of merging (Src, Dst) and (Dst, Src) might be different,
398   // we store both values here.
399   MergeGainTy CachedGainForward;
400   MergeGainTy CachedGainBackward;
401   // Whether the cached value must be recomputed.
402   bool CacheValidForward{false};
403   bool CacheValidBackward{false};
404 };
405 
406 void Chain::mergeEdges(Chain *Other) {
407   assert(this != Other && "cannot merge a chain with itself");
408 
409   // Update edges adjacent to chain Other
410   for (auto EdgeIt : Other->Edges) {
411     Chain *DstChain = EdgeIt.first;
412     ChainEdge *DstEdge = EdgeIt.second;
413     Chain *TargetChain = DstChain == Other ? this : DstChain;
414     ChainEdge *CurEdge = getEdge(TargetChain);
415     if (CurEdge == nullptr) {
416       DstEdge->changeEndpoint(Other, this);
417       this->addEdge(TargetChain, DstEdge);
418       if (DstChain != this && DstChain != Other) {
419         DstChain->addEdge(this, DstEdge);
420       }
421     } else {
422       CurEdge->moveJumps(DstEdge);
423     }
424     // Cleanup leftover edge
425     if (DstChain != Other) {
426       DstChain->removeEdge(Other);
427     }
428   }
429 }
430 
431 using BlockIter = std::vector<Block *>::const_iterator;
432 
433 /// A wrapper around three chains of blocks; it is used to avoid extra
434 /// instantiation of the vectors.
435 class MergedChain {
436 public:
437   MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(),
438               BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(),
439               BlockIter End3 = BlockIter())
440       : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3),
441         End3(End3) {}
442 
443   template <typename F> void forEach(const F &Func) const {
444     for (auto It = Begin1; It != End1; It++)
445       Func(*It);
446     for (auto It = Begin2; It != End2; It++)
447       Func(*It);
448     for (auto It = Begin3; It != End3; It++)
449       Func(*It);
450   }
451 
452   std::vector<Block *> getBlocks() const {
453     std::vector<Block *> Result;
454     Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) +
455                    std::distance(Begin3, End3));
456     Result.insert(Result.end(), Begin1, End1);
457     Result.insert(Result.end(), Begin2, End2);
458     Result.insert(Result.end(), Begin3, End3);
459     return Result;
460   }
461 
462   const Block *getFirstBlock() const { return *Begin1; }
463 
464 private:
465   BlockIter Begin1;
466   BlockIter End1;
467   BlockIter Begin2;
468   BlockIter End2;
469   BlockIter Begin3;
470   BlockIter End3;
471 };
472 
473 /// The implementation of the ExtTSP algorithm.
474 class ExtTSPImpl {
475   using EdgeT = std::pair<uint64_t, uint64_t>;
476   using EdgeCountMap = std::vector<std::pair<EdgeT, uint64_t>>;
477 
478 public:
479   ExtTSPImpl(size_t NumNodes, const std::vector<uint64_t> &NodeSizes,
480              const std::vector<uint64_t> &NodeCounts,
481              const EdgeCountMap &EdgeCounts)
482       : NumNodes(NumNodes) {
483     initialize(NodeSizes, NodeCounts, EdgeCounts);
484   }
485 
486   /// Run the algorithm and return an optimized ordering of blocks.
487   void run(std::vector<uint64_t> &Result) {
488     // Pass 1: Merge blocks with their mutually forced successors
489     mergeForcedPairs();
490 
491     // Pass 2: Merge pairs of chains while improving the ExtTSP objective
492     mergeChainPairs();
493 
494     // Pass 3: Merge cold blocks to reduce code size
495     mergeColdChains();
496 
497     // Collect blocks from all chains
498     concatChains(Result);
499   }
500 
501 private:
502   /// Initialize the algorithm's data structures.
503   void initialize(const std::vector<uint64_t> &NodeSizes,
504                   const std::vector<uint64_t> &NodeCounts,
505                   const EdgeCountMap &EdgeCounts) {
506     // Initialize blocks
507     AllBlocks.reserve(NumNodes);
508     for (uint64_t Node = 0; Node < NumNodes; Node++) {
509       uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
510       uint64_t ExecutionCount = NodeCounts[Node];
511       // The execution count of the entry block is set to at least 1
512       if (Node == 0 && ExecutionCount == 0)
513         ExecutionCount = 1;
514       AllBlocks.emplace_back(Node, Size, ExecutionCount);
515     }
516 
517     // Initialize jumps between blocks
518     SuccNodes.resize(NumNodes);
519     PredNodes.resize(NumNodes);
520     std::vector<uint64_t> OutDegree(NumNodes, 0);
521     AllJumps.reserve(EdgeCounts.size());
522     for (auto It : EdgeCounts) {
523       auto Pred = It.first.first;
524       auto Succ = It.first.second;
525       OutDegree[Pred]++;
526       // Ignore self-edges
527       if (Pred == Succ)
528         continue;
529 
530       SuccNodes[Pred].push_back(Succ);
531       PredNodes[Succ].push_back(Pred);
532       auto ExecutionCount = It.second;
533       if (ExecutionCount > 0) {
534         auto &Block = AllBlocks[Pred];
535         auto &SuccBlock = AllBlocks[Succ];
536         AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount);
537         SuccBlock.InJumps.push_back(&AllJumps.back());
538         Block.OutJumps.push_back(&AllJumps.back());
539       }
540     }
541     for (auto &Jump : AllJumps) {
542       assert(OutDegree[Jump.Source->Index] > 0);
543       Jump.IsConditional = OutDegree[Jump.Source->Index] > 1;
544     }
545 
546     // Initialize chains
547     AllChains.reserve(NumNodes);
548     HotChains.reserve(NumNodes);
549     for (Block &Block : AllBlocks) {
550       AllChains.emplace_back(Block.Index, &Block);
551       Block.CurChain = &AllChains.back();
552       if (Block.ExecutionCount > 0) {
553         HotChains.push_back(&AllChains.back());
554       }
555     }
556 
557     // Initialize chain edges
558     AllEdges.reserve(AllJumps.size());
559     for (Block &Block : AllBlocks) {
560       for (auto &Jump : Block.OutJumps) {
561         auto SuccBlock = Jump->Target;
562         ChainEdge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain);
563         // this edge is already present in the graph
564         if (CurEdge != nullptr) {
565           assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr);
566           CurEdge->appendJump(Jump);
567           continue;
568         }
569         // this is a new edge
570         AllEdges.emplace_back(Jump);
571         Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back());
572         SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back());
573       }
574     }
575   }
576 
577   /// For a pair of blocks, A and B, block B is the forced successor of A,
578   /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps
579   /// to B are from A. Such blocks should be adjacent in the optimal ordering;
580   /// the method finds and merges such pairs of blocks.
581   void mergeForcedPairs() {
582     // Find fallthroughs based on edge weights
583     for (auto &Block : AllBlocks) {
584       if (SuccNodes[Block.Index].size() == 1 &&
585           PredNodes[SuccNodes[Block.Index][0]].size() == 1 &&
586           SuccNodes[Block.Index][0] != 0) {
587         size_t SuccIndex = SuccNodes[Block.Index][0];
588         Block.ForcedSucc = &AllBlocks[SuccIndex];
589         AllBlocks[SuccIndex].ForcedPred = &Block;
590       }
591     }
592 
593     // There might be 'cycles' in the forced dependencies, since profile
594     // data isn't 100% accurate. Typically this is observed in loops, when the
595     // loop edges are the hottest successors for the basic blocks of the loop.
596     // Break the cycles by choosing the block with the smallest index as the
597     // head. This helps to keep the original order of the loops, which likely
598     // have already been rotated in the optimized manner.
599     for (auto &Block : AllBlocks) {
600       if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr)
601         continue;
602 
603       auto SuccBlock = Block.ForcedSucc;
604       while (SuccBlock != nullptr && SuccBlock != &Block) {
605         SuccBlock = SuccBlock->ForcedSucc;
606       }
607       if (SuccBlock == nullptr)
608         continue;
609       // Break the cycle
610       AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr;
611       Block.ForcedPred = nullptr;
612     }
613 
614     // Merge blocks with their fallthrough successors
615     for (auto &Block : AllBlocks) {
616       if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) {
617         auto CurBlock = &Block;
618         while (CurBlock->ForcedSucc != nullptr) {
619           const auto NextBlock = CurBlock->ForcedSucc;
620           mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y);
621           CurBlock = NextBlock;
622         }
623       }
624     }
625   }
626 
627   /// Merge pairs of chains while improving the ExtTSP objective.
628   void mergeChainPairs() {
629     /// Deterministically compare pairs of chains
630     auto compareChainPairs = [](const Chain *A1, const Chain *B1,
631                                 const Chain *A2, const Chain *B2) {
632       if (A1 != A2)
633         return A1->id() < A2->id();
634       return B1->id() < B2->id();
635     };
636 
637     while (HotChains.size() > 1) {
638       Chain *BestChainPred = nullptr;
639       Chain *BestChainSucc = nullptr;
640       auto BestGain = MergeGainTy();
641       // Iterate over all pairs of chains
642       for (Chain *ChainPred : HotChains) {
643         // Get candidates for merging with the current chain
644         for (auto EdgeIter : ChainPred->edges()) {
645           Chain *ChainSucc = EdgeIter.first;
646           class ChainEdge *ChainEdge = EdgeIter.second;
647           // Ignore loop edges
648           if (ChainPred == ChainSucc)
649             continue;
650 
651           // Stop early if the combined chain violates the maximum allowed size
652           if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
653             continue;
654 
655           // Compute the gain of merging the two chains
656           MergeGainTy CurGain =
657               getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
658           if (CurGain.score() <= EPS)
659             continue;
660 
661           if (BestGain < CurGain ||
662               (std::abs(CurGain.score() - BestGain.score()) < EPS &&
663                compareChainPairs(ChainPred, ChainSucc, BestChainPred,
664                                  BestChainSucc))) {
665             BestGain = CurGain;
666             BestChainPred = ChainPred;
667             BestChainSucc = ChainSucc;
668           }
669         }
670       }
671 
672       // Stop merging when there is no improvement
673       if (BestGain.score() <= EPS)
674         break;
675 
676       // Merge the best pair of chains
677       mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
678                   BestGain.mergeType());
679     }
680   }
681 
682   /// Merge remaining blocks into chains w/o taking jump counts into
683   /// consideration. This allows to maintain the original block order in the
684   /// absense of profile data
685   void mergeColdChains() {
686     for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
687       // Iterating in reverse order to make sure original fallthrough jumps are
688       // merged first; this might be beneficial for code size.
689       size_t NumSuccs = SuccNodes[SrcBB].size();
690       for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
691         auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
692         auto SrcChain = AllBlocks[SrcBB].CurChain;
693         auto DstChain = AllBlocks[DstBB].CurChain;
694         if (SrcChain != DstChain && !DstChain->isEntry() &&
695             SrcChain->blocks().back()->Index == SrcBB &&
696             DstChain->blocks().front()->Index == DstBB &&
697             SrcChain->isCold() == DstChain->isCold()) {
698           mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
699         }
700       }
701     }
702   }
703 
704   /// Compute the Ext-TSP score for a given block order and a list of jumps.
705   double extTSPScore(const MergedChain &MergedBlocks,
706                      const std::vector<Jump *> &Jumps) const {
707     if (Jumps.empty())
708       return 0.0;
709     uint64_t CurAddr = 0;
710     MergedBlocks.forEach([&](const Block *BB) {
711       BB->EstimatedAddr = CurAddr;
712       CurAddr += BB->Size;
713     });
714 
715     double Score = 0;
716     for (auto &Jump : Jumps) {
717       const Block *SrcBlock = Jump->Source;
718       const Block *DstBlock = Jump->Target;
719       Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
720                              DstBlock->EstimatedAddr, Jump->ExecutionCount,
721                              Jump->IsConditional);
722     }
723     return Score;
724   }
725 
726   /// Compute the gain of merging two chains.
727   ///
728   /// The function considers all possible ways of merging two chains and
729   /// computes the one having the largest increase in ExtTSP objective. The
730   /// result is a pair with the first element being the gain and the second
731   /// element being the corresponding merging type.
732   MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
733                                ChainEdge *Edge) const {
734     if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
735       return Edge->getCachedMergeGain(ChainPred, ChainSucc);
736     }
737 
738     // Precompute jumps between ChainPred and ChainSucc
739     auto Jumps = Edge->jumps();
740     ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
741     if (EdgePP != nullptr) {
742       Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end());
743     }
744     assert(!Jumps.empty() && "trying to merge chains w/o jumps");
745 
746     // The object holds the best currently chosen gain of merging the two chains
747     MergeGainTy Gain = MergeGainTy();
748 
749     /// Given a merge offset and a list of merge types, try to merge two chains
750     /// and update Gain with a better alternative
751     auto tryChainMerging = [&](size_t Offset,
752                                const std::vector<MergeTypeTy> &MergeTypes) {
753       // Skip merging corresponding to concatenation w/o splitting
754       if (Offset == 0 || Offset == ChainPred->blocks().size())
755         return;
756       // Skip merging if it breaks Forced successors
757       auto BB = ChainPred->blocks()[Offset - 1];
758       if (BB->ForcedSucc != nullptr)
759         return;
760       // Apply the merge, compute the corresponding gain, and update the best
761       // value, if the merge is beneficial
762       for (const auto &MergeType : MergeTypes) {
763         Gain.updateIfLessThan(
764             computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
765       }
766     };
767 
768     // Try to concatenate two chains w/o splitting
769     Gain.updateIfLessThan(
770         computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
771 
772     if (EnableChainSplitAlongJumps) {
773       // Attach (a part of) ChainPred before the first block of ChainSucc
774       for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
775         const auto SrcBlock = Jump->Source;
776         if (SrcBlock->CurChain != ChainPred)
777           continue;
778         size_t Offset = SrcBlock->CurIndex + 1;
779         tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
780       }
781 
782       // Attach (a part of) ChainPred after the last block of ChainSucc
783       for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
784         const auto DstBlock = Jump->Source;
785         if (DstBlock->CurChain != ChainPred)
786           continue;
787         size_t Offset = DstBlock->CurIndex;
788         tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
789       }
790     }
791 
792     // Try to break ChainPred in various ways and concatenate with ChainSucc
793     if (ChainPred->blocks().size() <= ChainSplitThreshold) {
794       for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
795         // Try to split the chain in different ways. In practice, applying
796         // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
797         // it from consideration to reduce the search space
798         tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
799                                  MergeTypeTy::X2_X1_Y});
800       }
801     }
802     Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
803     return Gain;
804   }
805 
806   /// Compute the score gain of merging two chains, respecting a given
807   /// merge 'type' and 'offset'.
808   ///
809   /// The two chains are not modified in the method.
810   MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
811                                const std::vector<Jump *> &Jumps,
812                                size_t MergeOffset,
813                                MergeTypeTy MergeType) const {
814     auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
815                                     MergeOffset, MergeType);
816 
817     // Do not allow a merge that does not preserve the original entry block
818     if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
819         !MergedBlocks.getFirstBlock()->isEntry())
820       return MergeGainTy();
821 
822     // The gain for the new chain
823     auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
824     return MergeGainTy(NewGainScore, MergeOffset, MergeType);
825   }
826 
827   /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
828   ///
829   /// If MergeType == 0, then the result is a concatenation of two chains.
830   /// Otherwise, the first chain is cut into two sub-chains at the offset,
831   /// and merged using all possible ways of concatenating three chains.
832   MergedChain mergeBlocks(const std::vector<Block *> &X,
833                           const std::vector<Block *> &Y, size_t MergeOffset,
834                           MergeTypeTy MergeType) const {
835     // Split the first chain, X, into X1 and X2
836     BlockIter BeginX1 = X.begin();
837     BlockIter EndX1 = X.begin() + MergeOffset;
838     BlockIter BeginX2 = X.begin() + MergeOffset;
839     BlockIter EndX2 = X.end();
840     BlockIter BeginY = Y.begin();
841     BlockIter EndY = Y.end();
842 
843     // Construct a new chain from the three existing ones
844     switch (MergeType) {
845     case MergeTypeTy::X_Y:
846       return MergedChain(BeginX1, EndX2, BeginY, EndY);
847     case MergeTypeTy::X1_Y_X2:
848       return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
849     case MergeTypeTy::Y_X2_X1:
850       return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
851     case MergeTypeTy::X2_X1_Y:
852       return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
853     }
854     llvm_unreachable("unexpected chain merge type");
855   }
856 
857   /// Merge chain From into chain Into, update the list of active chains,
858   /// adjacency information, and the corresponding cached values.
859   void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
860                    MergeTypeTy MergeType) {
861     assert(Into != From && "a chain cannot be merged with itself");
862 
863     // Merge the blocks
864     MergedChain MergedBlocks =
865         mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
866     Into->merge(From, MergedBlocks.getBlocks());
867     Into->mergeEdges(From);
868     From->clear();
869 
870     // Update cached ext-tsp score for the new chain
871     ChainEdge *SelfEdge = Into->getEdge(Into);
872     if (SelfEdge != nullptr) {
873       MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
874       Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
875     }
876 
877     // Remove chain From from the list of active chains
878     llvm::erase_value(HotChains, From);
879 
880     // Invalidate caches
881     for (auto EdgeIter : Into->edges()) {
882       EdgeIter.second->invalidateCache();
883     }
884   }
885 
886   /// Concatenate all chains into a final order of blocks.
887   void concatChains(std::vector<uint64_t> &Order) {
888     // Collect chains and calculate some stats for their sorting
889     std::vector<Chain *> SortedChains;
890     DenseMap<const Chain *, double> ChainDensity;
891     for (auto &Chain : AllChains) {
892       if (!Chain.blocks().empty()) {
893         SortedChains.push_back(&Chain);
894         // Using doubles to avoid overflow of ExecutionCount
895         double Size = 0;
896         double ExecutionCount = 0;
897         for (auto *Block : Chain.blocks()) {
898           Size += static_cast<double>(Block->Size);
899           ExecutionCount += static_cast<double>(Block->ExecutionCount);
900         }
901         assert(Size > 0 && "a chain of zero size");
902         ChainDensity[&Chain] = ExecutionCount / Size;
903       }
904     }
905 
906     // Sorting chains by density in the decreasing order
907     std::stable_sort(SortedChains.begin(), SortedChains.end(),
908                      [&](const Chain *C1, const Chain *C2) {
909                        // Make sure the original entry block is at the
910                        // beginning of the order
911                        if (C1->isEntry() != C2->isEntry()) {
912                          return C1->isEntry();
913                        }
914 
915                        const double D1 = ChainDensity[C1];
916                        const double D2 = ChainDensity[C2];
917                        // Compare by density and break ties by chain identifiers
918                        return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
919                      });
920 
921     // Collect the blocks in the order specified by their chains
922     Order.reserve(NumNodes);
923     for (Chain *Chain : SortedChains) {
924       for (Block *Block : Chain->blocks()) {
925         Order.push_back(Block->Index);
926       }
927     }
928   }
929 
930 private:
931   /// The number of nodes in the graph.
932   const size_t NumNodes;
933 
934   /// Successors of each node.
935   std::vector<std::vector<uint64_t>> SuccNodes;
936 
937   /// Predecessors of each node.
938   std::vector<std::vector<uint64_t>> PredNodes;
939 
940   /// All basic blocks.
941   std::vector<Block> AllBlocks;
942 
943   /// All jumps between blocks.
944   std::vector<Jump> AllJumps;
945 
946   /// All chains of basic blocks.
947   std::vector<Chain> AllChains;
948 
949   /// All edges between chains.
950   std::vector<ChainEdge> AllEdges;
951 
952   /// Active chains. The vector gets updated at runtime when chains are merged.
953   std::vector<Chain *> HotChains;
954 };
955 
956 } // end of anonymous namespace
957 
958 std::vector<uint64_t> llvm::applyExtTspLayout(
959     const std::vector<uint64_t> &NodeSizes,
960     const std::vector<uint64_t> &NodeCounts,
961     const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
962   size_t NumNodes = NodeSizes.size();
963 
964   // Verify correctness of the input data.
965   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
966   assert(NumNodes > 2 && "Incorrect input");
967 
968   // Apply the reordering algorithm.
969   auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
970   std::vector<uint64_t> Result;
971   Alg.run(Result);
972 
973   // Verify correctness of the output.
974   assert(Result.front() == 0 && "Original entry point is not preserved");
975   assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
976   return Result;
977 }
978 
979 double llvm::calcExtTspScore(
980     const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
981     const std::vector<uint64_t> &NodeCounts,
982     const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
983   // Estimate addresses of the blocks in memory
984   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
985   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
986     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
987   }
988   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
989   for (auto It : EdgeCounts) {
990     auto Pred = It.first.first;
991     OutDegree[Pred]++;
992   }
993 
994   // Increase the score for each jump
995   double Score = 0;
996   for (auto It : EdgeCounts) {
997     auto Pred = It.first.first;
998     auto Succ = It.first.second;
999     uint64_t Count = It.second;
1000     bool IsConditional = OutDegree[Pred] > 1;
1001     Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
1002                            IsConditional);
1003   }
1004   return Score;
1005 }
1006 
1007 double llvm::calcExtTspScore(
1008     const std::vector<uint64_t> &NodeSizes,
1009     const std::vector<uint64_t> &NodeCounts,
1010     const std::vector<std::pair<EdgeT, uint64_t>> &EdgeCounts) {
1011   std::vector<uint64_t> Order(NodeSizes.size());
1012   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
1013     Order[Idx] = Idx;
1014   }
1015   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
1016 }
1017