1b842725cSMichael J. Spencer //===- CallGraphSort.cpp --------------------------------------------------===// 2b842725cSMichael J. Spencer // 32946cd70SChandler Carruth // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 42946cd70SChandler Carruth // See https://llvm.org/LICENSE.txt for license information. 52946cd70SChandler Carruth // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6b842725cSMichael J. Spencer // 7b842725cSMichael J. Spencer //===----------------------------------------------------------------------===// 8b842725cSMichael J. Spencer /// 9904b3f66Sspupyrev /// The file is responsible for sorting sections using LLVM call graph profile 10904b3f66Sspupyrev /// data by placing frequently executed code sections together. The goal of the 11904b3f66Sspupyrev /// placement is to improve the runtime performance of the final executable by 12904b3f66Sspupyrev /// arranging code sections so that i-TLB misses and i-cache misses are reduced. 13904b3f66Sspupyrev /// 14904b3f66Sspupyrev /// The algorithm first builds a call graph based on the profile data and then 15904b3f66Sspupyrev /// iteratively merges "chains" (ordered lists) of input sections which will be 16904b3f66Sspupyrev /// laid out as a unit. There are two implementations for deciding how to 17904b3f66Sspupyrev /// merge a pair of chains: 18904b3f66Sspupyrev /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows 19904b3f66Sspupyrev /// "Optimizing Function Placement for Large-Scale Data-Center Applications" 20b842725cSMichael J. Spencer /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf 21904b3f66Sspupyrev /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which 22904b3f66Sspupyrev /// typically produces layouts with higher locality, and hence, yields fewer 23904b3f66Sspupyrev /// instruction cache misses on large binaries. 24b842725cSMichael J. Spencer //===----------------------------------------------------------------------===// 25b842725cSMichael J. Spencer 26b842725cSMichael J. Spencer #include "CallGraphSort.h" 2738fbedabSFangrui Song #include "InputFiles.h" 2827bb7990SFangrui Song #include "InputSection.h" 29b842725cSMichael J. Spencer #include "Symbols.h" 3027bb7990SFangrui Song #include "llvm/Support/FileSystem.h" 31904b3f66Sspupyrev #include "llvm/Transforms/Utils/CodeLayout.h" 32b842725cSMichael J. Spencer 337588cf09SFangrui Song #include <numeric> 347588cf09SFangrui Song 35b842725cSMichael J. Spencer using namespace llvm; 3607837b8fSFangrui Song using namespace lld; 3707837b8fSFangrui Song using namespace lld::elf; 38b842725cSMichael J. Spencer 39b842725cSMichael J. Spencer namespace { 40b842725cSMichael J. Spencer struct Edge { 413837f427SRui Ueyama int from; 423837f427SRui Ueyama uint64_t weight; 43b842725cSMichael J. Spencer }; 44b842725cSMichael J. Spencer 45b842725cSMichael J. Spencer struct Cluster { 467588cf09SFangrui Song Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} 47b842725cSMichael J. Spencer 48b842725cSMichael J. Spencer double getDensity() const { 493837f427SRui Ueyama if (size == 0) 50b842725cSMichael J. Spencer return 0; 513837f427SRui Ueyama return double(weight) / double(size); 52b842725cSMichael J. Spencer } 53b842725cSMichael J. Spencer 547588cf09SFangrui Song int next; 557588cf09SFangrui Song int prev; 56763671f3SZequan Wu uint64_t size; 573837f427SRui Ueyama uint64_t weight = 0; 583837f427SRui Ueyama uint64_t initialWeight = 0; 593837f427SRui Ueyama Edge bestPred = {-1, 0}; 60b842725cSMichael J. Spencer }; 61b842725cSMichael J. Spencer 62904b3f66Sspupyrev /// Implementation of the Call-Chain Clustering (C^3). The goal of this 63904b3f66Sspupyrev /// algorithm is to improve runtime performance of the executable by arranging 64904b3f66Sspupyrev /// code sections such that page table and i-cache misses are minimized. 65904b3f66Sspupyrev /// 66904b3f66Sspupyrev /// Definitions: 67904b3f66Sspupyrev /// * Cluster 68904b3f66Sspupyrev /// * An ordered list of input sections which are laid out as a unit. At the 69904b3f66Sspupyrev /// beginning of the algorithm each input section has its own cluster and 70904b3f66Sspupyrev /// the weight of the cluster is the sum of the weight of all incoming 71904b3f66Sspupyrev /// edges. 72904b3f66Sspupyrev /// * Call-Chain Clustering (C³) Heuristic 73904b3f66Sspupyrev /// * Defines when and how clusters are combined. Pick the highest weighted 74904b3f66Sspupyrev /// input section then add it to its most likely predecessor if it wouldn't 75904b3f66Sspupyrev /// penalize it too much. 76904b3f66Sspupyrev /// * Density 77904b3f66Sspupyrev /// * The weight of the cluster divided by the size of the cluster. This is a 78904b3f66Sspupyrev /// proxy for the amount of execution time spent per byte of the cluster. 79904b3f66Sspupyrev /// 80904b3f66Sspupyrev /// It does so given a call graph profile by the following: 81904b3f66Sspupyrev /// * Build a weighted call graph from the call graph profile 82904b3f66Sspupyrev /// * Sort input sections by weight 83904b3f66Sspupyrev /// * For each input section starting with the highest weight 84904b3f66Sspupyrev /// * Find its most likely predecessor cluster 85904b3f66Sspupyrev /// * Check if the combined cluster would be too large, or would have too low 86904b3f66Sspupyrev /// a density. 87904b3f66Sspupyrev /// * If not, then combine the clusters. 88904b3f66Sspupyrev /// * Sort non-empty clusters by density 89b842725cSMichael J. Spencer class CallGraphSort { 90b842725cSMichael J. Spencer public: 911c28f311SFangrui Song CallGraphSort(Ctx &); 92b842725cSMichael J. Spencer 93b842725cSMichael J. Spencer DenseMap<const InputSectionBase *, int> run(); 94b842725cSMichael J. Spencer 95b842725cSMichael J. Spencer private: 961c28f311SFangrui Song Ctx &ctx; 973837f427SRui Ueyama std::vector<Cluster> clusters; 983837f427SRui Ueyama std::vector<const InputSectionBase *> sections; 99b842725cSMichael J. Spencer }; 100b842725cSMichael J. Spencer 1015976a3f5SNico Weber // Maximum amount the combined cluster density can be worse than the original 102b842725cSMichael J. Spencer // cluster to consider merging. 103b842725cSMichael J. Spencer constexpr int MAX_DENSITY_DEGRADATION = 8; 104b842725cSMichael J. Spencer 105b842725cSMichael J. Spencer // Maximum cluster size in bytes. 106b842725cSMichael J. Spencer constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; 107b842725cSMichael J. Spencer } // end anonymous namespace 108b842725cSMichael J. Spencer 10968b9f45fSRui Ueyama using SectionPair = 11068b9f45fSRui Ueyama std::pair<const InputSectionBase *, const InputSectionBase *>; 1113d354081SRui Ueyama 112b8248dacSFangrui Song // Take the edge list in ctx.arg.callGraphProfile, resolve symbol names to 113b842725cSMichael J. Spencer // Symbols, and generate a graph between InputSections with the provided 114b842725cSMichael J. Spencer // weights. 1151c28f311SFangrui Song CallGraphSort::CallGraphSort(Ctx &ctx) : ctx(ctx) { 1166f482010SFangrui Song MapVector<SectionPair, uint64_t> &profile = ctx.arg.callGraphProfile; 1173837f427SRui Ueyama DenseMap<const InputSectionBase *, int> secToCluster; 118b842725cSMichael J. Spencer 1193837f427SRui Ueyama auto getOrCreateNode = [&](const InputSectionBase *isec) -> int { 1207588cf09SFangrui Song auto res = secToCluster.try_emplace(isec, clusters.size()); 1213837f427SRui Ueyama if (res.second) { 1223837f427SRui Ueyama sections.push_back(isec); 1233837f427SRui Ueyama clusters.emplace_back(clusters.size(), isec->getSize()); 124b842725cSMichael J. Spencer } 1253837f427SRui Ueyama return res.first->second; 126b842725cSMichael J. Spencer }; 127b842725cSMichael J. Spencer 128b842725cSMichael J. Spencer // Create the graph. 1293837f427SRui Ueyama for (std::pair<SectionPair, uint64_t> &c : profile) { 130e1b6b5beSFangrui Song const auto *fromSB = cast<InputSectionBase>(c.first.first); 131e1b6b5beSFangrui Song const auto *toSB = cast<InputSectionBase>(c.first.second); 1323837f427SRui Ueyama uint64_t weight = c.second; 133b842725cSMichael J. Spencer 134b842725cSMichael J. Spencer // Ignore edges between input sections belonging to different output 135b842725cSMichael J. Spencer // sections. This is done because otherwise we would end up with clusters 136b842725cSMichael J. Spencer // containing input sections that can't actually be placed adjacently in the 137b842725cSMichael J. Spencer // output. This messes with the cluster size and density calculations. We 138b842725cSMichael J. Spencer // would also end up moving input sections in other output sections without 139b842725cSMichael J. Spencer // moving them closer to what calls them. 1403837f427SRui Ueyama if (fromSB->getOutputSection() != toSB->getOutputSection()) 141b842725cSMichael J. Spencer continue; 142b842725cSMichael J. Spencer 1433837f427SRui Ueyama int from = getOrCreateNode(fromSB); 1443837f427SRui Ueyama int to = getOrCreateNode(toSB); 145b842725cSMichael J. Spencer 1463837f427SRui Ueyama clusters[to].weight += weight; 147b842725cSMichael J. Spencer 1483837f427SRui Ueyama if (from == to) 149b842725cSMichael J. Spencer continue; 150b842725cSMichael J. Spencer 151f0eedbceSGeorge Rimar // Remember the best edge. 1523837f427SRui Ueyama Cluster &toC = clusters[to]; 1533837f427SRui Ueyama if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { 1543837f427SRui Ueyama toC.bestPred.from = from; 1553837f427SRui Ueyama toC.bestPred.weight = weight; 156f0eedbceSGeorge Rimar } 157b842725cSMichael J. Spencer } 1583837f427SRui Ueyama for (Cluster &c : clusters) 1593837f427SRui Ueyama c.initialWeight = c.weight; 160b842725cSMichael J. Spencer } 161b842725cSMichael J. Spencer 162b842725cSMichael J. Spencer // It's bad to merge clusters which would degrade the density too much. 1633837f427SRui Ueyama static bool isNewDensityBad(Cluster &a, Cluster &b) { 1643837f427SRui Ueyama double newDensity = double(a.weight + b.weight) / double(a.size + b.size); 1653837f427SRui Ueyama return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; 166b842725cSMichael J. Spencer } 167b842725cSMichael J. Spencer 1687588cf09SFangrui Song // Find the leader of V's belonged cluster (represented as an equivalence 1697588cf09SFangrui Song // class). We apply union-find path-halving technique (simple to implement) in 1707588cf09SFangrui Song // the meantime as it decreases depths and the time complexity. 171a041ce3eSFangrui Song static int getLeader(int *leaders, int v) { 1727588cf09SFangrui Song while (leaders[v] != v) { 1737588cf09SFangrui Song leaders[v] = leaders[leaders[v]]; 1747588cf09SFangrui Song v = leaders[v]; 1757588cf09SFangrui Song } 1767588cf09SFangrui Song return v; 1777588cf09SFangrui Song } 1787588cf09SFangrui Song 1797588cf09SFangrui Song static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx, 1807588cf09SFangrui Song Cluster &from, int fromIdx) { 1817588cf09SFangrui Song int tail1 = into.prev, tail2 = from.prev; 1827588cf09SFangrui Song into.prev = tail2; 1837588cf09SFangrui Song cs[tail2].next = intoIdx; 1847588cf09SFangrui Song from.prev = tail1; 1857588cf09SFangrui Song cs[tail1].next = fromIdx; 1863837f427SRui Ueyama into.size += from.size; 1873837f427SRui Ueyama into.weight += from.weight; 1883837f427SRui Ueyama from.size = 0; 1893837f427SRui Ueyama from.weight = 0; 190b842725cSMichael J. Spencer } 191b842725cSMichael J. Spencer 192b842725cSMichael J. Spencer // Group InputSections into clusters using the Call-Chain Clustering heuristic 193b842725cSMichael J. Spencer // then sort the clusters by density. 1947588cf09SFangrui Song DenseMap<const InputSectionBase *, int> CallGraphSort::run() { 1957588cf09SFangrui Song std::vector<int> sorted(clusters.size()); 196a041ce3eSFangrui Song std::unique_ptr<int[]> leaders(new int[clusters.size()]); 197b842725cSMichael J. Spencer 198a041ce3eSFangrui Song std::iota(leaders.get(), leaders.get() + clusters.size(), 0); 1997588cf09SFangrui Song std::iota(sorted.begin(), sorted.end(), 0); 2007588cf09SFangrui Song llvm::stable_sort(sorted, [&](int a, int b) { 2013837f427SRui Ueyama return clusters[a].getDensity() > clusters[b].getDensity(); 202b842725cSMichael J. Spencer }); 203b842725cSMichael J. Spencer 2047588cf09SFangrui Song for (int l : sorted) { 2057588cf09SFangrui Song // The cluster index is the same as the index of its leader here because 2067588cf09SFangrui Song // clusters[L] has not been merged into another cluster yet. 2077588cf09SFangrui Song Cluster &c = clusters[l]; 208b842725cSMichael J. Spencer 209f0eedbceSGeorge Rimar // Don't consider merging if the edge is unlikely. 2103837f427SRui Ueyama if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) 211b842725cSMichael J. Spencer continue; 212b842725cSMichael J. Spencer 213a041ce3eSFangrui Song int predL = getLeader(leaders.get(), c.bestPred.from); 2147588cf09SFangrui Song if (l == predL) 215b842725cSMichael J. Spencer continue; 216b842725cSMichael J. Spencer 2177588cf09SFangrui Song Cluster *predC = &clusters[predL]; 2183837f427SRui Ueyama if (c.size + predC->size > MAX_CLUSTER_SIZE) 219b842725cSMichael J. Spencer continue; 220b842725cSMichael J. Spencer 2213837f427SRui Ueyama if (isNewDensityBad(*predC, c)) 222b842725cSMichael J. Spencer continue; 223b842725cSMichael J. Spencer 2247588cf09SFangrui Song leaders[l] = predL; 2257588cf09SFangrui Song mergeClusters(clusters, *predC, predL, c, l); 226b842725cSMichael J. Spencer } 227b842725cSMichael J. Spencer 2287588cf09SFangrui Song // Sort remaining non-empty clusters by density. 2297588cf09SFangrui Song sorted.clear(); 2307588cf09SFangrui Song for (int i = 0, e = (int)clusters.size(); i != e; ++i) 2317588cf09SFangrui Song if (clusters[i].size > 0) 2327588cf09SFangrui Song sorted.push_back(i); 2337588cf09SFangrui Song llvm::stable_sort(sorted, [&](int a, int b) { 2347588cf09SFangrui Song return clusters[a].getDensity() > clusters[b].getDensity(); 23597df22f1SGeorge Rimar }); 236b842725cSMichael J. Spencer 2373837f427SRui Ueyama DenseMap<const InputSectionBase *, int> orderMap; 238*bcc1e584SFangrui Song int curOrder = -clusters.size(); 239763671f3SZequan Wu for (int leader : sorted) { 2407588cf09SFangrui Song for (int i = leader;;) { 2417588cf09SFangrui Song orderMap[sections[i]] = curOrder++; 2427588cf09SFangrui Song i = clusters[i].next; 2437588cf09SFangrui Song if (i == leader) 2447588cf09SFangrui Song break; 2457588cf09SFangrui Song } 246763671f3SZequan Wu } 2476f482010SFangrui Song if (!ctx.arg.printSymbolOrder.empty()) { 2483837f427SRui Ueyama std::error_code ec; 2496f482010SFangrui Song raw_fd_ostream os(ctx.arg.printSymbolOrder, ec, sys::fs::OF_None); 2503837f427SRui Ueyama if (ec) { 25109c2c5e1SFangrui Song ErrAlways(ctx) << "cannot open " << ctx.arg.printSymbolOrder << ": " 25209c2c5e1SFangrui Song << ec.message(); 2533837f427SRui Ueyama return orderMap; 254432030e8SRui Ueyama } 255432030e8SRui Ueyama 25647cfe8f3SFangrui Song // Print the symbols ordered by C3, in the order of increasing curOrder 25747cfe8f3SFangrui Song // Instead of sorting all the orderMap, just repeat the loops above. 2587588cf09SFangrui Song for (int leader : sorted) 2597588cf09SFangrui Song for (int i = leader;;) { 260432030e8SRui Ueyama // Search all the symbols in the file of the section 261432030e8SRui Ueyama // and find out a Defined symbol with name that is within the section. 2627588cf09SFangrui Song for (Symbol *sym : sections[i]->file->getSymbols()) 2633837f427SRui Ueyama if (!sym->isSection()) // Filter out section-type symbols here. 2643837f427SRui Ueyama if (auto *d = dyn_cast<Defined>(sym)) 2657588cf09SFangrui Song if (sections[i] == d->section) 2663837f427SRui Ueyama os << sym->getName() << "\n"; 2677588cf09SFangrui Song i = clusters[i].next; 2687588cf09SFangrui Song if (i == leader) 2697588cf09SFangrui Song break; 2707588cf09SFangrui Song } 271432030e8SRui Ueyama } 272432030e8SRui Ueyama 2733837f427SRui Ueyama return orderMap; 274b842725cSMichael J. Spencer } 275b842725cSMichael J. Spencer 276904b3f66Sspupyrev // Sort sections by the profile data using the Cache-Directed Sort algorithm. 277904b3f66Sspupyrev // The placement is done by optimizing the locality by co-locating frequently 278904b3f66Sspupyrev // executed code sections together. 2798d2b070fSFangrui Song static DenseMap<const InputSectionBase *, int> 2808d2b070fSFangrui Song computeCacheDirectedSortOrder(Ctx &ctx) { 281904b3f66Sspupyrev SmallVector<uint64_t, 0> funcSizes; 282904b3f66Sspupyrev SmallVector<uint64_t, 0> funcCounts; 283904b3f66Sspupyrev SmallVector<codelayout::EdgeCount, 0> callCounts; 284904b3f66Sspupyrev SmallVector<uint64_t, 0> callOffsets; 285904b3f66Sspupyrev SmallVector<const InputSectionBase *, 0> sections; 286904b3f66Sspupyrev DenseMap<const InputSectionBase *, size_t> secToTargetId; 287904b3f66Sspupyrev 288904b3f66Sspupyrev auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { 289904b3f66Sspupyrev auto res = secToTargetId.try_emplace(inSec, sections.size()); 290904b3f66Sspupyrev if (res.second) { 291904b3f66Sspupyrev // inSec does not appear before in the graph. 292904b3f66Sspupyrev sections.push_back(inSec); 293904b3f66Sspupyrev funcSizes.push_back(inSec->getSize()); 294904b3f66Sspupyrev funcCounts.push_back(0); 295904b3f66Sspupyrev } 296904b3f66Sspupyrev return res.first->second; 297904b3f66Sspupyrev }; 298904b3f66Sspupyrev 299904b3f66Sspupyrev // Create the graph. 3006f482010SFangrui Song for (std::pair<SectionPair, uint64_t> &c : ctx.arg.callGraphProfile) { 301904b3f66Sspupyrev const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first); 302904b3f66Sspupyrev const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second); 303904b3f66Sspupyrev // Ignore edges between input sections belonging to different sections. 304904b3f66Sspupyrev if (fromSB->getOutputSection() != toSB->getOutputSection()) 305904b3f66Sspupyrev continue; 306904b3f66Sspupyrev 307904b3f66Sspupyrev uint64_t weight = c.second; 308904b3f66Sspupyrev // Ignore edges with zero weight. 309904b3f66Sspupyrev if (weight == 0) 310904b3f66Sspupyrev continue; 311904b3f66Sspupyrev 312904b3f66Sspupyrev size_t from = getOrCreateNode(fromSB); 313904b3f66Sspupyrev size_t to = getOrCreateNode(toSB); 314904b3f66Sspupyrev // Ignore self-edges (recursive calls). 315904b3f66Sspupyrev if (from == to) 316904b3f66Sspupyrev continue; 317904b3f66Sspupyrev 318904b3f66Sspupyrev callCounts.push_back({from, to, weight}); 319904b3f66Sspupyrev // Assume that the jump is at the middle of the input section. The profile 320904b3f66Sspupyrev // data does not contain jump offsets. 321904b3f66Sspupyrev callOffsets.push_back((funcSizes[from] + 1) / 2); 322904b3f66Sspupyrev funcCounts[to] += weight; 323904b3f66Sspupyrev } 324904b3f66Sspupyrev 325904b3f66Sspupyrev // Run the layout algorithm. 326904b3f66Sspupyrev std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout( 327904b3f66Sspupyrev funcSizes, funcCounts, callCounts, callOffsets); 328904b3f66Sspupyrev 329904b3f66Sspupyrev // Create the final order. 330904b3f66Sspupyrev DenseMap<const InputSectionBase *, int> orderMap; 331*bcc1e584SFangrui Song int curOrder = -sortedSections.size(); 332904b3f66Sspupyrev for (uint64_t secIdx : sortedSections) 333904b3f66Sspupyrev orderMap[sections[secIdx]] = curOrder++; 334904b3f66Sspupyrev 335904b3f66Sspupyrev return orderMap; 336904b3f66Sspupyrev } 337904b3f66Sspupyrev 338bf6e259bSFangrui Song // Sort sections by the profile data provided by --callgraph-profile-file. 339b842725cSMichael J. Spencer // 340b842725cSMichael J. Spencer // This first builds a call graph based on the profile data then merges sections 341904b3f66Sspupyrev // according either to the C³ or Cache-Directed-Sort ordering algorithm. 3421c28f311SFangrui Song DenseMap<const InputSectionBase *, int> 3431c28f311SFangrui Song elf::computeCallGraphProfileOrder(Ctx &ctx) { 3446f482010SFangrui Song if (ctx.arg.callGraphProfileSort == CGProfileSortKind::Cdsort) 3451c28f311SFangrui Song return computeCacheDirectedSortOrder(ctx); 3461c28f311SFangrui Song return CallGraphSort(ctx).run(); 347b842725cSMichael J. Spencer } 348