1 //===- CodeLayout.cpp - Implementation of code layout algorithms ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // The file implements "cache-aware" layout algorithms of basic blocks and 10 // functions in a binary. 11 // 12 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG 13 // optimizing jump locality and thus processor I-cache utilization. This is 14 // achieved via increasing the number of fall-through jumps and co-locating 15 // frequently executed nodes together. The name follows the underlying 16 // optimization problem, Extended-TSP, which is a generalization of classical 17 // (maximum) Traveling Salesmen Problem. 18 // 19 // The algorithm is a greedy heuristic that works with chains (ordered lists) 20 // of basic blocks. Initially all chains are isolated basic blocks. On every 21 // iteration, we pick a pair of chains whose merging yields the biggest increase 22 // in the ExtTSP score, which models how i-cache "friendly" a specific chain is. 23 // A pair of chains giving the maximum gain is merged into a new chain. The 24 // procedure stops when there is only one chain left, or when merging does not 25 // increase ExtTSP. In the latter case, the remaining chains are sorted by 26 // density in the decreasing order. 27 // 28 // An important aspect is the way two chains are merged. Unlike earlier 29 // algorithms (e.g., based on the approach of Pettis-Hansen), two 30 // chains, X and Y, are first split into three, X1, X2, and Y. Then we 31 // consider all possible ways of gluing the three chains (e.g., X1YX2, X1X2Y, 32 // X2X1Y, X2YX1, YX1X2, YX2X1) and choose the one producing the largest score. 33 // This improves the quality of the final result (the search space is larger) 34 // while keeping the implementation sufficiently fast. 35 // 36 // Reference: 37 // * A. Newell and S. Pupyrev, Improved Basic Block Reordering, 38 // IEEE Transactions on Computers, 2020 39 // https://arxiv.org/abs/1809.04676 40 // 41 //===----------------------------------------------------------------------===// 42 43 #include "llvm/Transforms/Utils/CodeLayout.h" 44 #include "llvm/Support/CommandLine.h" 45 #include "llvm/Support/Debug.h" 46 47 #include <cmath> 48 #include <set> 49 50 using namespace llvm; 51 using namespace llvm::codelayout; 52 53 #define DEBUG_TYPE "code-layout" 54 55 namespace llvm { 56 cl::opt<bool> EnableExtTspBlockPlacement( 57 "enable-ext-tsp-block-placement", cl::Hidden, cl::init(false), 58 cl::desc("Enable machine block placement based on the ext-tsp model, " 59 "optimizing I-cache utilization.")); 60 61 cl::opt<bool> ApplyExtTspWithoutProfile( 62 "ext-tsp-apply-without-profile", 63 cl::desc("Whether to apply ext-tsp placement for instances w/o profile"), 64 cl::init(true), cl::Hidden); 65 } // namespace llvm 66 67 // Algorithm-specific params for Ext-TSP. The values are tuned for the best 68 // performance of large-scale front-end bound binaries. 69 static cl::opt<double> ForwardWeightCond( 70 "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1), 71 cl::desc("The weight of conditional forward jumps for ExtTSP value")); 72 73 static cl::opt<double> ForwardWeightUncond( 74 "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1), 75 cl::desc("The weight of unconditional forward jumps for ExtTSP value")); 76 77 static cl::opt<double> BackwardWeightCond( 78 "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1), 79 cl::desc("The weight of conditional backward jumps for ExtTSP value")); 80 81 static cl::opt<double> BackwardWeightUncond( 82 "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1), 83 cl::desc("The weight of unconditional backward jumps for ExtTSP value")); 84 85 static cl::opt<double> FallthroughWeightCond( 86 "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0), 87 cl::desc("The weight of conditional fallthrough jumps for ExtTSP value")); 88 89 static cl::opt<double> FallthroughWeightUncond( 90 "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05), 91 cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value")); 92 93 static cl::opt<unsigned> ForwardDistance( 94 "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024), 95 cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP")); 96 97 static cl::opt<unsigned> BackwardDistance( 98 "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640), 99 cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP")); 100 101 // The maximum size of a chain created by the algorithm. The size is bounded 102 // so that the algorithm can efficiently process extremely large instances. 103 static cl::opt<unsigned> 104 MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(512), 105 cl::desc("The maximum size of a chain to create")); 106 107 // The maximum ratio between densities of two chains for merging. 108 static cl::opt<double> MaxMergeDensityRatio( 109 "ext-tsp-max-merge-density-ratio", cl::ReallyHidden, cl::init(128), 110 cl::desc("The maximum ratio between densities of two chains for merging")); 111 112 // Algorithm-specific options for CDS. 113 static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden, 114 cl::desc("The size of the cache")); 115 116 static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden, 117 cl::desc("The size of a line in the cache")); 118 119 static cl::opt<double> DistancePower( 120 "cds-distance-power", cl::ReallyHidden, 121 cl::desc("The power exponent for the distance-based locality")); 122 123 static cl::opt<double> FrequencyScale( 124 "cds-frequency-scale", cl::ReallyHidden, 125 cl::desc("The scale factor for the frequency-based locality")); 126 127 namespace { 128 129 // Epsilon for comparison of doubles. 130 constexpr double EPS = 1e-8; 131 132 // Compute the Ext-TSP score for a given jump. 133 double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count, 134 double Weight) { 135 if (JumpDist > JumpMaxDist) 136 return 0; 137 double Prob = 1.0 - static_cast<double>(JumpDist) / JumpMaxDist; 138 return Weight * Prob * Count; 139 } 140 141 // Compute the Ext-TSP score for a jump between a given pair of blocks, 142 // using their sizes, (estimated) addresses and the jump execution count. 143 double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr, 144 uint64_t Count, bool IsConditional) { 145 // Fallthrough 146 if (SrcAddr + SrcSize == DstAddr) { 147 return jumpExtTSPScore(0, 1, Count, 148 IsConditional ? FallthroughWeightCond 149 : FallthroughWeightUncond); 150 } 151 // Forward 152 if (SrcAddr + SrcSize < DstAddr) { 153 const uint64_t Dist = DstAddr - (SrcAddr + SrcSize); 154 return jumpExtTSPScore(Dist, ForwardDistance, Count, 155 IsConditional ? ForwardWeightCond 156 : ForwardWeightUncond); 157 } 158 // Backward 159 const uint64_t Dist = SrcAddr + SrcSize - DstAddr; 160 return jumpExtTSPScore(Dist, BackwardDistance, Count, 161 IsConditional ? BackwardWeightCond 162 : BackwardWeightUncond); 163 } 164 165 /// A type of merging two chains, X and Y. The former chain is split into 166 /// X1 and X2 and then concatenated with Y in the order specified by the type. 167 enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y }; 168 169 /// The gain of merging two chains, that is, the Ext-TSP score of the merge 170 /// together with the corresponding merge 'type' and 'offset'. 171 struct MergeGainT { 172 explicit MergeGainT() = default; 173 explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType) 174 : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {} 175 176 double score() const { return Score; } 177 178 size_t mergeOffset() const { return MergeOffset; } 179 180 MergeTypeT mergeType() const { return MergeType; } 181 182 void setMergeType(MergeTypeT Ty) { MergeType = Ty; } 183 184 // Returns 'true' iff Other is preferred over this. 185 bool operator<(const MergeGainT &Other) const { 186 return (Other.Score > EPS && Other.Score > Score + EPS); 187 } 188 189 // Update the current gain if Other is preferred over this. 190 void updateIfLessThan(const MergeGainT &Other) { 191 if (*this < Other) 192 *this = Other; 193 } 194 195 private: 196 double Score{-1.0}; 197 size_t MergeOffset{0}; 198 MergeTypeT MergeType{MergeTypeT::X_Y}; 199 }; 200 201 struct JumpT; 202 struct ChainT; 203 struct ChainEdge; 204 205 /// A node in the graph, typically corresponding to a basic block in the CFG or 206 /// a function in the call graph. 207 struct NodeT { 208 NodeT(const NodeT &) = delete; 209 NodeT(NodeT &&) = default; 210 NodeT &operator=(const NodeT &) = delete; 211 NodeT &operator=(NodeT &&) = default; 212 213 explicit NodeT(size_t Index, uint64_t Size, uint64_t Count) 214 : Index(Index), Size(Size), ExecutionCount(Count) {} 215 216 bool isEntry() const { return Index == 0; } 217 218 // Check if Other is a successor of the node. 219 bool isSuccessor(const NodeT *Other) const; 220 // The total execution count of outgoing jumps. 221 uint64_t outCount() const; 222 223 // The total execution count of incoming jumps. 224 uint64_t inCount() const; 225 226 // The original index of the node in graph. 227 size_t Index{0}; 228 // The index of the node in the current chain. 229 size_t CurIndex{0}; 230 // The size of the node in the binary. 231 uint64_t Size{0}; 232 // The execution count of the node in the profile data. 233 uint64_t ExecutionCount{0}; 234 // The current chain of the node. 235 ChainT *CurChain{nullptr}; 236 // The offset of the node in the current chain. 237 mutable uint64_t EstimatedAddr{0}; 238 // Forced successor of the node in the graph. 239 NodeT *ForcedSucc{nullptr}; 240 // Forced predecessor of the node in the graph. 241 NodeT *ForcedPred{nullptr}; 242 // Outgoing jumps from the node. 243 std::vector<JumpT *> OutJumps; 244 // Incoming jumps to the node. 245 std::vector<JumpT *> InJumps; 246 }; 247 248 /// An arc in the graph, typically corresponding to a jump between two nodes. 249 struct JumpT { 250 JumpT(const JumpT &) = delete; 251 JumpT(JumpT &&) = default; 252 JumpT &operator=(const JumpT &) = delete; 253 JumpT &operator=(JumpT &&) = default; 254 255 explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount) 256 : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} 257 258 // Source node of the jump. 259 NodeT *Source; 260 // Target node of the jump. 261 NodeT *Target; 262 // Execution count of the arc in the profile data. 263 uint64_t ExecutionCount{0}; 264 // Whether the jump corresponds to a conditional branch. 265 bool IsConditional{false}; 266 // The offset of the jump from the source node. 267 uint64_t Offset{0}; 268 }; 269 270 /// A chain (ordered sequence) of nodes in the graph. 271 struct ChainT { 272 ChainT(const ChainT &) = delete; 273 ChainT(ChainT &&) = default; 274 ChainT &operator=(const ChainT &) = delete; 275 ChainT &operator=(ChainT &&) = default; 276 277 explicit ChainT(uint64_t Id, NodeT *Node) 278 : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size), 279 Nodes(1, Node) {} 280 281 size_t numBlocks() const { return Nodes.size(); } 282 283 double density() const { return static_cast<double>(ExecutionCount) / Size; } 284 285 bool isEntry() const { return Nodes[0]->Index == 0; } 286 287 bool isCold() const { 288 for (NodeT *Node : Nodes) { 289 if (Node->ExecutionCount > 0) 290 return false; 291 } 292 return true; 293 } 294 295 ChainEdge *getEdge(ChainT *Other) const { 296 for (const auto &[Chain, ChainEdge] : Edges) { 297 if (Chain == Other) 298 return ChainEdge; 299 } 300 return nullptr; 301 } 302 303 void removeEdge(ChainT *Other) { 304 auto It = Edges.begin(); 305 while (It != Edges.end()) { 306 if (It->first == Other) { 307 Edges.erase(It); 308 return; 309 } 310 It++; 311 } 312 } 313 314 void addEdge(ChainT *Other, ChainEdge *Edge) { 315 Edges.push_back(std::make_pair(Other, Edge)); 316 } 317 318 void merge(ChainT *Other, std::vector<NodeT *> MergedBlocks) { 319 Nodes = std::move(MergedBlocks); 320 // Update the chain's data. 321 ExecutionCount += Other->ExecutionCount; 322 Size += Other->Size; 323 Id = Nodes[0]->Index; 324 // Update the node's data. 325 for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { 326 Nodes[Idx]->CurChain = this; 327 Nodes[Idx]->CurIndex = Idx; 328 } 329 } 330 331 void mergeEdges(ChainT *Other); 332 333 void clear() { 334 Nodes.clear(); 335 Nodes.shrink_to_fit(); 336 Edges.clear(); 337 Edges.shrink_to_fit(); 338 } 339 340 // Unique chain identifier. 341 uint64_t Id; 342 // Cached ext-tsp score for the chain. 343 double Score{0}; 344 // The total execution count of the chain. 345 uint64_t ExecutionCount{0}; 346 // The total size of the chain. 347 uint64_t Size{0}; 348 // Nodes of the chain. 349 std::vector<NodeT *> Nodes; 350 // Adjacent chains and corresponding edges (lists of jumps). 351 std::vector<std::pair<ChainT *, ChainEdge *>> Edges; 352 }; 353 354 /// An edge in the graph representing jumps between two chains. 355 /// When nodes are merged into chains, the edges are combined too so that 356 /// there is always at most one edge between a pair of chains. 357 struct ChainEdge { 358 ChainEdge(const ChainEdge &) = delete; 359 ChainEdge(ChainEdge &&) = default; 360 ChainEdge &operator=(const ChainEdge &) = delete; 361 ChainEdge &operator=(ChainEdge &&) = delete; 362 363 explicit ChainEdge(JumpT *Jump) 364 : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain), 365 Jumps(1, Jump) {} 366 367 ChainT *srcChain() const { return SrcChain; } 368 369 ChainT *dstChain() const { return DstChain; } 370 371 bool isSelfEdge() const { return SrcChain == DstChain; } 372 373 const std::vector<JumpT *> &jumps() const { return Jumps; } 374 375 void appendJump(JumpT *Jump) { Jumps.push_back(Jump); } 376 377 void moveJumps(ChainEdge *Other) { 378 Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); 379 Other->Jumps.clear(); 380 Other->Jumps.shrink_to_fit(); 381 } 382 383 void changeEndpoint(ChainT *From, ChainT *To) { 384 if (From == SrcChain) 385 SrcChain = To; 386 if (From == DstChain) 387 DstChain = To; 388 } 389 390 bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const { 391 return Src == SrcChain ? CacheValidForward : CacheValidBackward; 392 } 393 394 MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const { 395 return Src == SrcChain ? CachedGainForward : CachedGainBackward; 396 } 397 398 void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) { 399 if (Src == SrcChain) { 400 CachedGainForward = MergeGain; 401 CacheValidForward = true; 402 } else { 403 CachedGainBackward = MergeGain; 404 CacheValidBackward = true; 405 } 406 } 407 408 void invalidateCache() { 409 CacheValidForward = false; 410 CacheValidBackward = false; 411 } 412 413 void setMergeGain(MergeGainT Gain) { CachedGain = Gain; } 414 415 MergeGainT getMergeGain() const { return CachedGain; } 416 417 double gain() const { return CachedGain.score(); } 418 419 private: 420 // Source chain. 421 ChainT *SrcChain{nullptr}; 422 // Destination chain. 423 ChainT *DstChain{nullptr}; 424 // Original jumps in the binary with corresponding execution counts. 425 std::vector<JumpT *> Jumps; 426 // Cached gain value for merging the pair of chains. 427 MergeGainT CachedGain; 428 429 // Cached gain values for merging the pair of chains. Since the gain of 430 // merging (Src, Dst) and (Dst, Src) might be different, we store both values 431 // here and a flag indicating which of the options results in a higher gain. 432 // Cached gain values. 433 MergeGainT CachedGainForward; 434 MergeGainT CachedGainBackward; 435 // Whether the cached value must be recomputed. 436 bool CacheValidForward{false}; 437 bool CacheValidBackward{false}; 438 }; 439 440 bool NodeT::isSuccessor(const NodeT *Other) const { 441 for (JumpT *Jump : OutJumps) { 442 if (Jump->Target == Other) 443 return true; 444 } 445 return false; 446 } 447 448 uint64_t NodeT::outCount() const { 449 uint64_t Count = 0; 450 for (JumpT *Jump : OutJumps) 451 Count += Jump->ExecutionCount; 452 return Count; 453 } 454 455 uint64_t NodeT::inCount() const { 456 uint64_t Count = 0; 457 for (JumpT *Jump : InJumps) 458 Count += Jump->ExecutionCount; 459 return Count; 460 } 461 462 void ChainT::mergeEdges(ChainT *Other) { 463 // Update edges adjacent to chain Other. 464 for (const auto &[DstChain, DstEdge] : Other->Edges) { 465 ChainT *TargetChain = DstChain == Other ? this : DstChain; 466 ChainEdge *CurEdge = getEdge(TargetChain); 467 if (CurEdge == nullptr) { 468 DstEdge->changeEndpoint(Other, this); 469 this->addEdge(TargetChain, DstEdge); 470 if (DstChain != this && DstChain != Other) 471 DstChain->addEdge(this, DstEdge); 472 } else { 473 CurEdge->moveJumps(DstEdge); 474 } 475 // Cleanup leftover edge. 476 if (DstChain != Other) 477 DstChain->removeEdge(Other); 478 } 479 } 480 481 /// A wrapper around three concatenated vectors (chains) of objects; it is used 482 /// to avoid extra instantiation of the vectors. 483 template <typename ObjType> struct MergedVector { 484 using ObjIter = typename std::vector<ObjType *>::const_iterator; 485 486 MergedVector(ObjIter Begin1, ObjIter End1, ObjIter Begin2 = ObjIter(), 487 ObjIter End2 = ObjIter(), ObjIter Begin3 = ObjIter(), 488 ObjIter End3 = ObjIter()) 489 : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), 490 End3(End3) {} 491 492 template <typename F> void forEach(const F &Func) const { 493 for (auto It = Begin1; It != End1; It++) 494 Func(*It); 495 for (auto It = Begin2; It != End2; It++) 496 Func(*It); 497 for (auto It = Begin3; It != End3; It++) 498 Func(*It); 499 } 500 501 std::vector<NodeT *> getNodes() const { 502 std::vector<NodeT *> Result; 503 Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + 504 std::distance(Begin3, End3)); 505 Result.insert(Result.end(), Begin1, End1); 506 Result.insert(Result.end(), Begin2, End2); 507 Result.insert(Result.end(), Begin3, End3); 508 return Result; 509 } 510 511 const NodeT *getFirstNode() const { return *Begin1; } 512 513 bool empty() const { return Begin1 == End1; } 514 515 void append(ObjIter Begin, ObjIter End) { 516 if (Begin2 == End2) { 517 Begin2 = Begin; 518 End2 = End; 519 return; 520 } 521 assert(Begin3 == End3 && "cannot extend MergedVector"); 522 Begin3 = Begin; 523 End3 = End; 524 } 525 526 private: 527 ObjIter Begin1; 528 ObjIter End1; 529 ObjIter Begin2; 530 ObjIter End2; 531 ObjIter Begin3; 532 ObjIter End3; 533 }; 534 535 /// Merge two chains of nodes respecting a given 'type' and 'offset'. 536 /// 537 /// If MergeType == 0, then the result is a concatenation of two chains. 538 /// Otherwise, the first chain is cut into two sub-chains at the offset, 539 /// and merged using all possible ways of concatenating three chains. 540 MergedVector<NodeT> mergeNodes(const std::vector<NodeT *> &X, 541 const std::vector<NodeT *> &Y, 542 size_t MergeOffset, MergeTypeT MergeType) { 543 // Split the first chain, X, into X1 and X2. 544 MergedVector<NodeT>::ObjIter BeginX1 = X.begin(); 545 MergedVector<NodeT>::ObjIter EndX1 = X.begin() + MergeOffset; 546 MergedVector<NodeT>::ObjIter BeginX2 = X.begin() + MergeOffset; 547 MergedVector<NodeT>::ObjIter EndX2 = X.end(); 548 MergedVector<NodeT>::ObjIter BeginY = Y.begin(); 549 MergedVector<NodeT>::ObjIter EndY = Y.end(); 550 551 // Construct a new chain from the three existing ones. 552 switch (MergeType) { 553 case MergeTypeT::X_Y: 554 return MergedVector<NodeT>(BeginX1, EndX2, BeginY, EndY); 555 case MergeTypeT::Y_X: 556 return MergedVector<NodeT>(BeginY, EndY, BeginX1, EndX2); 557 case MergeTypeT::X1_Y_X2: 558 return MergedVector<NodeT>(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); 559 case MergeTypeT::Y_X2_X1: 560 return MergedVector<NodeT>(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); 561 case MergeTypeT::X2_X1_Y: 562 return MergedVector<NodeT>(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); 563 } 564 llvm_unreachable("unexpected chain merge type"); 565 } 566 567 /// The implementation of the ExtTSP algorithm. 568 class ExtTSPImpl { 569 public: 570 ExtTSPImpl(ArrayRef<uint64_t> NodeSizes, ArrayRef<uint64_t> NodeCounts, 571 ArrayRef<EdgeCount> EdgeCounts) 572 : NumNodes(NodeSizes.size()) { 573 initialize(NodeSizes, NodeCounts, EdgeCounts); 574 } 575 576 /// Run the algorithm and return an optimized ordering of nodes. 577 std::vector<uint64_t> run() { 578 // Pass 1: Merge nodes with their mutually forced successors 579 mergeForcedPairs(); 580 581 // Pass 2: Merge pairs of chains while improving the ExtTSP objective 582 mergeChainPairs(); 583 584 // Pass 3: Merge cold nodes to reduce code size 585 mergeColdChains(); 586 587 // Collect nodes from all chains 588 return concatChains(); 589 } 590 591 private: 592 /// Initialize the algorithm's data structures. 593 void initialize(const ArrayRef<uint64_t> &NodeSizes, 594 const ArrayRef<uint64_t> &NodeCounts, 595 const ArrayRef<EdgeCount> &EdgeCounts) { 596 // Initialize nodes 597 AllNodes.reserve(NumNodes); 598 for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { 599 uint64_t Size = std::max<uint64_t>(NodeSizes[Idx], 1ULL); 600 uint64_t ExecutionCount = NodeCounts[Idx]; 601 // The execution count of the entry node is set to at least one. 602 if (Idx == 0 && ExecutionCount == 0) 603 ExecutionCount = 1; 604 AllNodes.emplace_back(Idx, Size, ExecutionCount); 605 } 606 607 // Initialize jumps between nodes 608 SuccNodes.resize(NumNodes); 609 PredNodes.resize(NumNodes); 610 std::vector<uint64_t> OutDegree(NumNodes, 0); 611 AllJumps.reserve(EdgeCounts.size()); 612 for (auto Edge : EdgeCounts) { 613 ++OutDegree[Edge.src]; 614 // Ignore self-edges. 615 if (Edge.src == Edge.dst) 616 continue; 617 618 SuccNodes[Edge.src].push_back(Edge.dst); 619 PredNodes[Edge.dst].push_back(Edge.src); 620 if (Edge.count > 0) { 621 NodeT &PredNode = AllNodes[Edge.src]; 622 NodeT &SuccNode = AllNodes[Edge.dst]; 623 AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); 624 SuccNode.InJumps.push_back(&AllJumps.back()); 625 PredNode.OutJumps.push_back(&AllJumps.back()); 626 } 627 } 628 for (JumpT &Jump : AllJumps) { 629 assert(OutDegree[Jump.Source->Index] > 0); 630 Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; 631 } 632 633 // Initialize chains. 634 AllChains.reserve(NumNodes); 635 HotChains.reserve(NumNodes); 636 for (NodeT &Node : AllNodes) { 637 // Adjust execution counts. 638 Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount()); 639 Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount()); 640 // Create a chain. 641 AllChains.emplace_back(Node.Index, &Node); 642 Node.CurChain = &AllChains.back(); 643 if (Node.ExecutionCount > 0) 644 HotChains.push_back(&AllChains.back()); 645 } 646 647 // Initialize chain edges. 648 AllEdges.reserve(AllJumps.size()); 649 for (NodeT &PredNode : AllNodes) { 650 for (JumpT *Jump : PredNode.OutJumps) { 651 NodeT *SuccNode = Jump->Target; 652 ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); 653 // This edge is already present in the graph. 654 if (CurEdge != nullptr) { 655 assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); 656 CurEdge->appendJump(Jump); 657 continue; 658 } 659 // This is a new edge. 660 AllEdges.emplace_back(Jump); 661 PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); 662 SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); 663 } 664 } 665 } 666 667 /// For a pair of nodes, A and B, node B is the forced successor of A, 668 /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps 669 /// to B are from A. Such nodes should be adjacent in the optimal ordering; 670 /// the method finds and merges such pairs of nodes. 671 void mergeForcedPairs() { 672 // Find forced pairs of blocks. 673 for (NodeT &Node : AllNodes) { 674 if (SuccNodes[Node.Index].size() == 1 && 675 PredNodes[SuccNodes[Node.Index][0]].size() == 1 && 676 SuccNodes[Node.Index][0] != 0) { 677 size_t SuccIndex = SuccNodes[Node.Index][0]; 678 Node.ForcedSucc = &AllNodes[SuccIndex]; 679 AllNodes[SuccIndex].ForcedPred = &Node; 680 } 681 } 682 683 // There might be 'cycles' in the forced dependencies, since profile 684 // data isn't 100% accurate. Typically this is observed in loops, when the 685 // loop edges are the hottest successors for the basic blocks of the loop. 686 // Break the cycles by choosing the node with the smallest index as the 687 // head. This helps to keep the original order of the loops, which likely 688 // have already been rotated in the optimized manner. 689 for (NodeT &Node : AllNodes) { 690 if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr) 691 continue; 692 693 NodeT *SuccNode = Node.ForcedSucc; 694 while (SuccNode != nullptr && SuccNode != &Node) { 695 SuccNode = SuccNode->ForcedSucc; 696 } 697 if (SuccNode == nullptr) 698 continue; 699 // Break the cycle. 700 AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; 701 Node.ForcedPred = nullptr; 702 } 703 704 // Merge nodes with their fallthrough successors. 705 for (NodeT &Node : AllNodes) { 706 if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { 707 const NodeT *CurBlock = &Node; 708 while (CurBlock->ForcedSucc != nullptr) { 709 const NodeT *NextBlock = CurBlock->ForcedSucc; 710 mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y); 711 CurBlock = NextBlock; 712 } 713 } 714 } 715 } 716 717 /// Merge pairs of chains while improving the ExtTSP objective. 718 void mergeChainPairs() { 719 /// Deterministically compare pairs of chains. 720 auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, 721 const ChainT *A2, const ChainT *B2) { 722 return std::make_tuple(A1->Id, B1->Id) < std::make_tuple(A2->Id, B2->Id); 723 }; 724 725 double PrevScore = 1e9; 726 while (HotChains.size() > 1) { 727 ChainT *BestChainPred = nullptr; 728 ChainT *BestChainSucc = nullptr; 729 MergeGainT BestGain; 730 // Iterate over all pairs of chains. 731 for (ChainT *ChainPred : HotChains) { 732 // Since the score of merging doesn't increase, we can stop early when 733 // the newly found merge is as good as the previous one. 734 if (BestGain.score() == PrevScore) 735 break; 736 // Get candidates for merging with the current chain. 737 for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { 738 // Ignore loop edges. 739 if (Edge->isSelfEdge()) 740 continue; 741 // Stop early if the combined chain violates the maximum allowed size. 742 if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) 743 continue; 744 // Don't merge the chains if they have vastly different densities. 745 // We stop early if the ratio between the densities exceeds 746 // MaxMergeDensityRatio. Smaller values of the option result in 747 // fewer merges (hence, more chains), which in turn typically yields 748 // smaller size of the hot code section. 749 double minDensity = 750 std::min(ChainPred->density(), ChainSucc->density()); 751 double maxDensity = 752 std::max(ChainPred->density(), ChainSucc->density()); 753 assert(minDensity > 0.0 && maxDensity > 0.0 && 754 "incorrectly computed chain densities"); 755 const double Ratio = maxDensity / minDensity; 756 if (Ratio > MaxMergeDensityRatio) 757 continue; 758 759 // Compute the gain of merging the two chains 760 MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); 761 if (CurGain.score() <= EPS) 762 continue; 763 764 if (BestGain < CurGain || 765 (std::abs(CurGain.score() - BestGain.score()) < EPS && 766 compareChainPairs(ChainPred, ChainSucc, BestChainPred, 767 BestChainSucc))) { 768 BestGain = CurGain; 769 BestChainPred = ChainPred; 770 BestChainSucc = ChainSucc; 771 // Stop early when the merge is as good as the previous one. 772 if (BestGain.score() == PrevScore) 773 break; 774 } 775 } 776 } 777 778 // Stop merging when there is no improvement. 779 if (BestGain.score() <= EPS) 780 break; 781 782 // Merge the best pair of chains. 783 PrevScore = BestGain.score(); 784 mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), 785 BestGain.mergeType()); 786 } 787 } 788 789 /// Merge remaining nodes into chains w/o taking jump counts into 790 /// consideration. This allows to maintain the original node order in the 791 /// absence of profile data. 792 void mergeColdChains() { 793 for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { 794 // Iterating in reverse order to make sure original fallthrough jumps are 795 // merged first; this might be beneficial for code size. 796 size_t NumSuccs = SuccNodes[SrcBB].size(); 797 for (size_t Idx = 0; Idx < NumSuccs; Idx++) { 798 size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; 799 ChainT *SrcChain = AllNodes[SrcBB].CurChain; 800 ChainT *DstChain = AllNodes[DstBB].CurChain; 801 if (SrcChain != DstChain && !DstChain->isEntry() && 802 SrcChain->Nodes.back()->Index == SrcBB && 803 DstChain->Nodes.front()->Index == DstBB && 804 SrcChain->isCold() == DstChain->isCold()) { 805 mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y); 806 } 807 } 808 } 809 } 810 811 /// Compute the Ext-TSP score for a given node order and a list of jumps. 812 double extTSPScore(const MergedVector<NodeT> &Nodes, 813 const MergedVector<JumpT> &Jumps) const { 814 if (Jumps.empty() || Nodes.empty()) 815 return 0.0; 816 817 uint64_t CurAddr = 0; 818 Nodes.forEach([&](const NodeT *Node) { 819 Node->EstimatedAddr = CurAddr; 820 CurAddr += Node->Size; 821 }); 822 823 double Score = 0; 824 Jumps.forEach([&](const JumpT *Jump) { 825 const NodeT *SrcBlock = Jump->Source; 826 const NodeT *DstBlock = Jump->Target; 827 Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size, 828 DstBlock->EstimatedAddr, Jump->ExecutionCount, 829 Jump->IsConditional); 830 }); 831 return Score; 832 } 833 834 /// Compute the gain of merging two chains. 835 /// 836 /// The function considers all possible ways of merging two chains and 837 /// computes the one having the largest increase in ExtTSP objective. The 838 /// result is a pair with the first element being the gain and the second 839 /// element being the corresponding merging type. 840 MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc, 841 ChainEdge *Edge) const { 842 if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) 843 return Edge->getCachedMergeGain(ChainPred, ChainSucc); 844 845 // Precompute jumps between ChainPred and ChainSucc. 846 MergedVector<JumpT> Jumps(Edge->jumps().begin(), Edge->jumps().end()); 847 assert(!Jumps.empty() && "trying to merge chains w/o jumps"); 848 ChainEdge *EdgePP = ChainPred->getEdge(ChainPred); 849 if (EdgePP != nullptr) 850 Jumps.append(EdgePP->jumps().begin(), EdgePP->jumps().end()); 851 852 // This object holds the best chosen gain of merging two chains. 853 MergeGainT Gain = MergeGainT(); 854 855 /// Given a merge offset and a list of merge types, try to merge two chains 856 /// and update Gain with a better alternative. 857 auto tryChainMerging = [&](size_t Offset, 858 const std::vector<MergeTypeT> &MergeTypes) { 859 // Skip merging corresponding to concatenation w/o splitting. 860 if (Offset == 0 || Offset == ChainPred->Nodes.size()) 861 return; 862 // Skip merging if it breaks Forced successors. 863 NodeT *Node = ChainPred->Nodes[Offset - 1]; 864 if (Node->ForcedSucc != nullptr) 865 return; 866 // Apply the merge, compute the corresponding gain, and update the best 867 // value, if the merge is beneficial. 868 for (const MergeTypeT &MergeType : MergeTypes) { 869 Gain.updateIfLessThan( 870 computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); 871 } 872 }; 873 874 // Try to concatenate two chains w/o splitting. 875 Gain.updateIfLessThan( 876 computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); 877 878 // Attach (a part of) ChainPred before the first node of ChainSucc. 879 for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { 880 const NodeT *SrcBlock = Jump->Source; 881 if (SrcBlock->CurChain != ChainPred) 882 continue; 883 size_t Offset = SrcBlock->CurIndex + 1; 884 tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); 885 } 886 887 // Attach (a part of) ChainPred after the last node of ChainSucc. 888 for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { 889 const NodeT *DstBlock = Jump->Target; 890 if (DstBlock->CurChain != ChainPred) 891 continue; 892 size_t Offset = DstBlock->CurIndex; 893 tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1}); 894 } 895 896 // Try to break ChainPred in various ways and concatenate with ChainSucc. 897 // In practice, applying X2_Y_X1 merging is almost never provides benefits; 898 // thus, we exclude it from consideration to reduce the search space. 899 for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { 900 // Do not split the chain along a jump. 901 const NodeT *BB = ChainPred->Nodes[Offset - 1]; 902 const NodeT *BB2 = ChainPred->Nodes[Offset]; 903 if (BB->isSuccessor(BB2)) 904 continue; 905 906 tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, 907 MergeTypeT::X2_X1_Y}); 908 } 909 Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain); 910 return Gain; 911 } 912 913 /// Compute the score gain of merging two chains, respecting a given 914 /// merge 'type' and 'offset'. 915 /// 916 /// The two chains are not modified in the method. 917 MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc, 918 const MergedVector<JumpT> &Jumps, 919 size_t MergeOffset, MergeTypeT MergeType) const { 920 MergedVector<NodeT> MergedNodes = 921 mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); 922 923 // Do not allow a merge that does not preserve the original entry point. 924 if ((ChainPred->isEntry() || ChainSucc->isEntry()) && 925 !MergedNodes.getFirstNode()->isEntry()) 926 return MergeGainT(); 927 928 // The gain for the new chain. 929 double NewScore = extTSPScore(MergedNodes, Jumps); 930 double CurScore = ChainPred->Score; 931 return MergeGainT(NewScore - CurScore, MergeOffset, MergeType); 932 } 933 934 /// Merge chain From into chain Into, update the list of active chains, 935 /// adjacency information, and the corresponding cached values. 936 void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, 937 MergeTypeT MergeType) { 938 assert(Into != From && "a chain cannot be merged with itself"); 939 940 // Merge the nodes. 941 MergedVector<NodeT> MergedNodes = 942 mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); 943 Into->merge(From, MergedNodes.getNodes()); 944 945 // Merge the edges. 946 Into->mergeEdges(From); 947 From->clear(); 948 949 // Update cached ext-tsp score for the new chain. 950 ChainEdge *SelfEdge = Into->getEdge(Into); 951 if (SelfEdge != nullptr) { 952 MergedNodes = MergedVector<NodeT>(Into->Nodes.begin(), Into->Nodes.end()); 953 MergedVector<JumpT> MergedJumps(SelfEdge->jumps().begin(), 954 SelfEdge->jumps().end()); 955 Into->Score = extTSPScore(MergedNodes, MergedJumps); 956 } 957 958 // Remove the chain from the list of active chains. 959 llvm::erase_value(HotChains, From); 960 961 // Invalidate caches. 962 for (auto EdgeIt : Into->Edges) 963 EdgeIt.second->invalidateCache(); 964 } 965 966 /// Concatenate all chains into the final order. 967 std::vector<uint64_t> concatChains() { 968 // Collect chains and calculate density stats for their sorting. 969 std::vector<const ChainT *> SortedChains; 970 DenseMap<const ChainT *, double> ChainDensity; 971 for (ChainT &Chain : AllChains) { 972 if (!Chain.Nodes.empty()) { 973 SortedChains.push_back(&Chain); 974 // Using doubles to avoid overflow of ExecutionCounts. 975 double Size = 0; 976 double ExecutionCount = 0; 977 for (NodeT *Node : Chain.Nodes) { 978 Size += static_cast<double>(Node->Size); 979 ExecutionCount += static_cast<double>(Node->ExecutionCount); 980 } 981 assert(Size > 0 && "a chain of zero size"); 982 ChainDensity[&Chain] = ExecutionCount / Size; 983 } 984 } 985 986 // Sorting chains by density in the decreasing order. 987 std::sort(SortedChains.begin(), SortedChains.end(), 988 [&](const ChainT *L, const ChainT *R) { 989 // Place the entry point is at the beginning of the order. 990 if (L->isEntry() != R->isEntry()) 991 return L->isEntry(); 992 993 const double DL = ChainDensity[L]; 994 const double DR = ChainDensity[R]; 995 // Compare by density and break ties by chain identifiers. 996 return std::make_tuple(-DL, L->Id) < 997 std::make_tuple(-DR, R->Id); 998 }); 999 1000 // Collect the nodes in the order specified by their chains. 1001 std::vector<uint64_t> Order; 1002 Order.reserve(NumNodes); 1003 for (const ChainT *Chain : SortedChains) 1004 for (NodeT *Node : Chain->Nodes) 1005 Order.push_back(Node->Index); 1006 return Order; 1007 } 1008 1009 private: 1010 /// The number of nodes in the graph. 1011 const size_t NumNodes; 1012 1013 /// Successors of each node. 1014 std::vector<std::vector<uint64_t>> SuccNodes; 1015 1016 /// Predecessors of each node. 1017 std::vector<std::vector<uint64_t>> PredNodes; 1018 1019 /// All nodes (basic blocks) in the graph. 1020 std::vector<NodeT> AllNodes; 1021 1022 /// All jumps between the nodes. 1023 std::vector<JumpT> AllJumps; 1024 1025 /// All chains of nodes. 1026 std::vector<ChainT> AllChains; 1027 1028 /// All edges between the chains. 1029 std::vector<ChainEdge> AllEdges; 1030 1031 /// Active chains. The vector gets updated at runtime when chains are merged. 1032 std::vector<ChainT *> HotChains; 1033 }; 1034 1035 /// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering 1036 /// functions represented by a call graph. 1037 class CDSortImpl { 1038 public: 1039 CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes, 1040 ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts, 1041 ArrayRef<uint64_t> EdgeOffsets) 1042 : Config(Config), NumNodes(NodeSizes.size()) { 1043 initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets); 1044 } 1045 1046 /// Run the algorithm and return an ordered set of function clusters. 1047 std::vector<uint64_t> run() { 1048 // Merge pairs of chains while improving the objective. 1049 mergeChainPairs(); 1050 1051 LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number" 1052 << " of chains from " << NumNodes << " to " 1053 << HotChains.size() << "\n"); 1054 1055 // Collect nodes from all the chains. 1056 return concatChains(); 1057 } 1058 1059 private: 1060 /// Initialize the algorithm's data structures. 1061 void initialize(const ArrayRef<uint64_t> &NodeSizes, 1062 const ArrayRef<uint64_t> &NodeCounts, 1063 const ArrayRef<EdgeCount> &EdgeCounts, 1064 const ArrayRef<uint64_t> &EdgeOffsets) { 1065 // Initialize nodes. 1066 AllNodes.reserve(NumNodes); 1067 for (uint64_t Node = 0; Node < NumNodes; Node++) { 1068 uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL); 1069 uint64_t ExecutionCount = NodeCounts[Node]; 1070 AllNodes.emplace_back(Node, Size, ExecutionCount); 1071 TotalSamples += ExecutionCount; 1072 if (ExecutionCount > 0) 1073 TotalSize += Size; 1074 } 1075 1076 // Initialize jumps between the nodes. 1077 SuccNodes.resize(NumNodes); 1078 PredNodes.resize(NumNodes); 1079 AllJumps.reserve(EdgeCounts.size()); 1080 for (size_t I = 0; I < EdgeCounts.size(); I++) { 1081 auto [Pred, Succ, Count] = EdgeCounts[I]; 1082 // Ignore recursive calls. 1083 if (Pred == Succ) 1084 continue; 1085 1086 SuccNodes[Pred].push_back(Succ); 1087 PredNodes[Succ].push_back(Pred); 1088 if (Count > 0) { 1089 NodeT &PredNode = AllNodes[Pred]; 1090 NodeT &SuccNode = AllNodes[Succ]; 1091 AllJumps.emplace_back(&PredNode, &SuccNode, Count); 1092 AllJumps.back().Offset = EdgeOffsets[I]; 1093 SuccNode.InJumps.push_back(&AllJumps.back()); 1094 PredNode.OutJumps.push_back(&AllJumps.back()); 1095 } 1096 } 1097 1098 // Initialize chains. 1099 AllChains.reserve(NumNodes); 1100 HotChains.reserve(NumNodes); 1101 for (NodeT &Node : AllNodes) { 1102 // Adjust execution counts. 1103 Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount()); 1104 Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount()); 1105 // Create chain. 1106 AllChains.emplace_back(Node.Index, &Node); 1107 Node.CurChain = &AllChains.back(); 1108 if (Node.ExecutionCount > 0) 1109 HotChains.push_back(&AllChains.back()); 1110 } 1111 1112 // Initialize chain edges. 1113 AllEdges.reserve(AllJumps.size()); 1114 for (NodeT &PredNode : AllNodes) { 1115 for (JumpT *Jump : PredNode.OutJumps) { 1116 NodeT *SuccNode = Jump->Target; 1117 ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); 1118 // this edge is already present in the graph. 1119 if (CurEdge != nullptr) { 1120 assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); 1121 CurEdge->appendJump(Jump); 1122 continue; 1123 } 1124 // this is a new edge. 1125 AllEdges.emplace_back(Jump); 1126 PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); 1127 SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); 1128 } 1129 } 1130 } 1131 1132 /// Merge pairs of chains while there is an improvement in the objective. 1133 void mergeChainPairs() { 1134 // Create a priority queue containing all edges ordered by the merge gain. 1135 auto GainComparator = [](ChainEdge *L, ChainEdge *R) { 1136 return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) < 1137 std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id); 1138 }; 1139 std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator); 1140 1141 // Insert the edges into the queue. 1142 for (ChainT *ChainPred : HotChains) { 1143 for (const auto &[_, Edge] : ChainPred->Edges) { 1144 // Ignore self-edges. 1145 if (Edge->isSelfEdge()) 1146 continue; 1147 // Ignore already processed edges. 1148 if (Edge->gain() != -1.0) 1149 continue; 1150 1151 // Compute the gain of merging the two chains. 1152 MergeGainT Gain = getBestMergeGain(Edge); 1153 Edge->setMergeGain(Gain); 1154 1155 if (Edge->gain() > EPS) 1156 Queue.insert(Edge); 1157 } 1158 } 1159 1160 // Merge the chains while the gain of merging is positive. 1161 while (!Queue.empty()) { 1162 // Extract the best (top) edge for merging. 1163 ChainEdge *BestEdge = *Queue.begin(); 1164 Queue.erase(Queue.begin()); 1165 // Ignore self-edges. 1166 if (BestEdge->isSelfEdge()) 1167 continue; 1168 // Ignore edges with non-positive gains. 1169 if (BestEdge->gain() <= EPS) 1170 continue; 1171 1172 ChainT *BestSrcChain = BestEdge->srcChain(); 1173 ChainT *BestDstChain = BestEdge->dstChain(); 1174 1175 // Remove outdated edges from the queue. 1176 for (const auto &[_, ChainEdge] : BestSrcChain->Edges) 1177 Queue.erase(ChainEdge); 1178 for (const auto &[_, ChainEdge] : BestDstChain->Edges) 1179 Queue.erase(ChainEdge); 1180 1181 // Merge the best pair of chains. 1182 MergeGainT BestGain = BestEdge->getMergeGain(); 1183 mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(), 1184 BestGain.mergeType()); 1185 1186 // Insert newly created edges into the queue. 1187 for (const auto &[_, Edge] : BestSrcChain->Edges) { 1188 // Ignore loop edges. 1189 if (Edge->isSelfEdge()) 1190 continue; 1191 1192 // Compute the gain of merging the two chains. 1193 MergeGainT Gain = getBestMergeGain(Edge); 1194 Edge->setMergeGain(Gain); 1195 1196 if (Edge->gain() > EPS) 1197 Queue.insert(Edge); 1198 } 1199 } 1200 } 1201 1202 /// Compute the gain of merging two chains. 1203 /// 1204 /// The function considers all possible ways of merging two chains and 1205 /// computes the one having the largest increase in ExtTSP objective. The 1206 /// result is a pair with the first element being the gain and the second 1207 /// element being the corresponding merging type. 1208 MergeGainT getBestMergeGain(ChainEdge *Edge) const { 1209 // Precompute jumps between ChainPred and ChainSucc. 1210 auto Jumps = Edge->jumps(); 1211 assert(!Jumps.empty() && "trying to merge chains w/o jumps"); 1212 ChainT *SrcChain = Edge->srcChain(); 1213 ChainT *DstChain = Edge->dstChain(); 1214 1215 // This object holds the best currently chosen gain of merging two chains. 1216 MergeGainT Gain = MergeGainT(); 1217 1218 /// Given a list of merge types, try to merge two chains and update Gain 1219 /// with a better alternative. 1220 auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) { 1221 // Apply the merge, compute the corresponding gain, and update the best 1222 // value, if the merge is beneficial. 1223 for (const MergeTypeT &MergeType : MergeTypes) { 1224 MergeGainT NewGain = 1225 computeMergeGain(SrcChain, DstChain, Jumps, MergeType); 1226 1227 // When forward and backward gains are the same, prioritize merging that 1228 // preserves the original order of the functions in the binary. 1229 if (std::abs(Gain.score() - NewGain.score()) < EPS) { 1230 if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) || 1231 (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) { 1232 Gain = NewGain; 1233 } 1234 } else if (NewGain.score() > Gain.score() + EPS) { 1235 Gain = NewGain; 1236 } 1237 } 1238 }; 1239 1240 // Try to concatenate two chains w/o splitting. 1241 tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X}); 1242 1243 return Gain; 1244 } 1245 1246 /// Compute the score gain of merging two chains, respecting a given type. 1247 /// 1248 /// The two chains are not modified in the method. 1249 MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc, 1250 const std::vector<JumpT *> &Jumps, 1251 MergeTypeT MergeType) const { 1252 // This doesn't depend on the ordering of the nodes 1253 double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc); 1254 1255 // Merge offset is always 0, as the chains are not split. 1256 size_t MergeOffset = 0; 1257 auto MergedBlocks = 1258 mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); 1259 double DistGain = distBasedLocalityGain(MergedBlocks, Jumps); 1260 1261 double GainScore = DistGain + Config.FrequencyScale * FreqGain; 1262 // Scale the result to increase the importance of merging short chains. 1263 if (GainScore >= 0.0) 1264 GainScore /= std::min(ChainPred->Size, ChainSucc->Size); 1265 1266 return MergeGainT(GainScore, MergeOffset, MergeType); 1267 } 1268 1269 /// Compute the change of the frequency locality after merging the chains. 1270 double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const { 1271 auto missProbability = [&](double ChainDensity) { 1272 double PageSamples = ChainDensity * Config.CacheSize; 1273 if (PageSamples >= TotalSamples) 1274 return 0.0; 1275 double P = PageSamples / TotalSamples; 1276 return pow(1.0 - P, static_cast<double>(Config.CacheEntries)); 1277 }; 1278 1279 // Cache misses on the chains before merging. 1280 double CurScore = 1281 ChainPred->ExecutionCount * missProbability(ChainPred->density()) + 1282 ChainSucc->ExecutionCount * missProbability(ChainSucc->density()); 1283 1284 // Cache misses on the merged chain 1285 double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; 1286 double MergedSize = ChainPred->Size + ChainSucc->Size; 1287 double MergedDensity = static_cast<double>(MergedCounts) / MergedSize; 1288 double NewScore = MergedCounts * missProbability(MergedDensity); 1289 1290 return CurScore - NewScore; 1291 } 1292 1293 /// Compute the distance locality for a jump / call. 1294 double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const { 1295 uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr; 1296 double D = Dist == 0 ? 0.1 : static_cast<double>(Dist); 1297 return static_cast<double>(Count) * std::pow(D, -Config.DistancePower); 1298 } 1299 1300 /// Compute the change of the distance locality after merging the chains. 1301 double distBasedLocalityGain(const MergedVector<NodeT> &MergedBlocks, 1302 const std::vector<JumpT *> &Jumps) const { 1303 if (Jumps.empty()) 1304 return 0.0; 1305 uint64_t CurAddr = 0; 1306 MergedBlocks.forEach([&](const NodeT *Node) { 1307 Node->EstimatedAddr = CurAddr; 1308 CurAddr += Node->Size; 1309 }); 1310 1311 double CurScore = 0; 1312 double NewScore = 0; 1313 for (const JumpT *Arc : Jumps) { 1314 uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset; 1315 uint64_t DstAddr = Arc->Target->EstimatedAddr; 1316 NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount); 1317 CurScore += distScore(0, TotalSize, Arc->ExecutionCount); 1318 } 1319 return NewScore - CurScore; 1320 } 1321 1322 /// Merge chain From into chain Into, update the list of active chains, 1323 /// adjacency information, and the corresponding cached values. 1324 void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, 1325 MergeTypeT MergeType) { 1326 assert(Into != From && "a chain cannot be merged with itself"); 1327 1328 // Merge the nodes. 1329 MergedVector<NodeT> MergedNodes = 1330 mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); 1331 Into->merge(From, MergedNodes.getNodes()); 1332 1333 // Merge the edges. 1334 Into->mergeEdges(From); 1335 From->clear(); 1336 1337 // Remove the chain from the list of active chains. 1338 llvm::erase_value(HotChains, From); 1339 } 1340 1341 /// Concatenate all chains into the final order. 1342 std::vector<uint64_t> concatChains() { 1343 // Collect chains and calculate density stats for their sorting. 1344 std::vector<const ChainT *> SortedChains; 1345 DenseMap<const ChainT *, double> ChainDensity; 1346 for (ChainT &Chain : AllChains) { 1347 if (!Chain.Nodes.empty()) { 1348 SortedChains.push_back(&Chain); 1349 // Using doubles to avoid overflow of ExecutionCounts. 1350 double Size = 0; 1351 double ExecutionCount = 0; 1352 for (NodeT *Node : Chain.Nodes) { 1353 Size += static_cast<double>(Node->Size); 1354 ExecutionCount += static_cast<double>(Node->ExecutionCount); 1355 } 1356 assert(Size > 0 && "a chain of zero size"); 1357 ChainDensity[&Chain] = ExecutionCount / Size; 1358 } 1359 } 1360 1361 // Sort chains by density in the decreasing order. 1362 std::sort(SortedChains.begin(), SortedChains.end(), 1363 [&](const ChainT *L, const ChainT *R) { 1364 const double DL = ChainDensity[L]; 1365 const double DR = ChainDensity[R]; 1366 // Compare by density and break ties by chain identifiers. 1367 return std::make_tuple(-DL, L->Id) < 1368 std::make_tuple(-DR, R->Id); 1369 }); 1370 1371 // Collect the nodes in the order specified by their chains. 1372 std::vector<uint64_t> Order; 1373 Order.reserve(NumNodes); 1374 for (const ChainT *Chain : SortedChains) 1375 for (NodeT *Node : Chain->Nodes) 1376 Order.push_back(Node->Index); 1377 return Order; 1378 } 1379 1380 private: 1381 /// Config for the algorithm. 1382 const CDSortConfig Config; 1383 1384 /// The number of nodes in the graph. 1385 const size_t NumNodes; 1386 1387 /// Successors of each node. 1388 std::vector<std::vector<uint64_t>> SuccNodes; 1389 1390 /// Predecessors of each node. 1391 std::vector<std::vector<uint64_t>> PredNodes; 1392 1393 /// All nodes (functions) in the graph. 1394 std::vector<NodeT> AllNodes; 1395 1396 /// All jumps (function calls) between the nodes. 1397 std::vector<JumpT> AllJumps; 1398 1399 /// All chains of nodes. 1400 std::vector<ChainT> AllChains; 1401 1402 /// All edges between the chains. 1403 std::vector<ChainEdge> AllEdges; 1404 1405 /// Active chains. The vector gets updated at runtime when chains are merged. 1406 std::vector<ChainT *> HotChains; 1407 1408 /// The total number of samples in the graph. 1409 uint64_t TotalSamples{0}; 1410 1411 /// The total size of the nodes in the graph. 1412 uint64_t TotalSize{0}; 1413 }; 1414 1415 } // end of anonymous namespace 1416 1417 std::vector<uint64_t> 1418 codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes, 1419 ArrayRef<uint64_t> NodeCounts, 1420 ArrayRef<EdgeCount> EdgeCounts) { 1421 // Verify correctness of the input data. 1422 assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); 1423 assert(NodeSizes.size() > 2 && "Incorrect input"); 1424 1425 // Apply the reordering algorithm. 1426 ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts); 1427 std::vector<uint64_t> Result = Alg.run(); 1428 1429 // Verify correctness of the output. 1430 assert(Result.front() == 0 && "Original entry point is not preserved"); 1431 assert(Result.size() == NodeSizes.size() && "Incorrect size of layout"); 1432 return Result; 1433 } 1434 1435 double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order, 1436 ArrayRef<uint64_t> NodeSizes, 1437 ArrayRef<uint64_t> NodeCounts, 1438 ArrayRef<EdgeCount> EdgeCounts) { 1439 // Estimate addresses of the blocks in memory. 1440 std::vector<uint64_t> Addr(NodeSizes.size(), 0); 1441 for (size_t Idx = 1; Idx < Order.size(); Idx++) { 1442 Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; 1443 } 1444 std::vector<uint64_t> OutDegree(NodeSizes.size(), 0); 1445 for (auto Edge : EdgeCounts) 1446 ++OutDegree[Edge.src]; 1447 1448 // Increase the score for each jump. 1449 double Score = 0; 1450 for (auto Edge : EdgeCounts) { 1451 bool IsConditional = OutDegree[Edge.src] > 1; 1452 Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst], 1453 Edge.count, IsConditional); 1454 } 1455 return Score; 1456 } 1457 1458 double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes, 1459 ArrayRef<uint64_t> NodeCounts, 1460 ArrayRef<EdgeCount> EdgeCounts) { 1461 std::vector<uint64_t> Order(NodeSizes.size()); 1462 for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { 1463 Order[Idx] = Idx; 1464 } 1465 return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); 1466 } 1467 1468 std::vector<uint64_t> codelayout::computeCacheDirectedLayout( 1469 const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes, 1470 ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts, 1471 ArrayRef<uint64_t> CallOffsets) { 1472 // Verify correctness of the input data. 1473 assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); 1474 1475 // Apply the reordering algorithm. 1476 CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); 1477 std::vector<uint64_t> Result = Alg.run(); 1478 assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); 1479 return Result; 1480 } 1481 1482 std::vector<uint64_t> codelayout::computeCacheDirectedLayout( 1483 ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts, 1484 ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) { 1485 CDSortConfig Config; 1486 // Populate the config from the command-line options. 1487 if (CacheEntries.getNumOccurrences() > 0) 1488 Config.CacheEntries = CacheEntries; 1489 if (CacheSize.getNumOccurrences() > 0) 1490 Config.CacheSize = CacheSize; 1491 if (DistancePower.getNumOccurrences() > 0) 1492 Config.DistancePower = DistancePower; 1493 if (FrequencyScale.getNumOccurrences() > 0) 1494 Config.FrequencyScale = FrequencyScale; 1495 return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts, 1496 CallOffsets); 1497 } 1498