1 //===- CallGraphSort.cpp --------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// The file is responsible for sorting sections using LLVM call graph profile 10 /// data by placing frequently executed code sections together. The goal of the 11 /// placement is to improve the runtime performance of the final executable by 12 /// arranging code sections so that i-TLB misses and i-cache misses are reduced. 13 /// 14 /// The algorithm first builds a call graph based on the profile data and then 15 /// iteratively merges "chains" (ordered lists) of input sections which will be 16 /// laid out as a unit. There are two implementations for deciding how to 17 /// merge a pair of chains: 18 /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows 19 /// "Optimizing Function Placement for Large-Scale Data-Center Applications" 20 /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf 21 /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which 22 /// typically produces layouts with higher locality, and hence, yields fewer 23 /// instruction cache misses on large binaries. 24 //===----------------------------------------------------------------------===// 25 26 #include "CallGraphSort.h" 27 #include "InputFiles.h" 28 #include "InputSection.h" 29 #include "Symbols.h" 30 #include "llvm/Support/FileSystem.h" 31 #include "llvm/Transforms/Utils/CodeLayout.h" 32 33 #include <numeric> 34 35 using namespace llvm; 36 using namespace lld; 37 using namespace lld::elf; 38 39 namespace { 40 struct Edge { 41 int from; 42 uint64_t weight; 43 }; 44 45 struct Cluster { 46 Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} 47 48 double getDensity() const { 49 if (size == 0) 50 return 0; 51 return double(weight) / double(size); 52 } 53 54 int next; 55 int prev; 56 uint64_t size; 57 uint64_t weight = 0; 58 uint64_t initialWeight = 0; 59 Edge bestPred = {-1, 0}; 60 }; 61 62 /// Implementation of the Call-Chain Clustering (C^3). The goal of this 63 /// algorithm is to improve runtime performance of the executable by arranging 64 /// code sections such that page table and i-cache misses are minimized. 65 /// 66 /// Definitions: 67 /// * Cluster 68 /// * An ordered list of input sections which are laid out as a unit. At the 69 /// beginning of the algorithm each input section has its own cluster and 70 /// the weight of the cluster is the sum of the weight of all incoming 71 /// edges. 72 /// * Call-Chain Clustering (C³) Heuristic 73 /// * Defines when and how clusters are combined. Pick the highest weighted 74 /// input section then add it to its most likely predecessor if it wouldn't 75 /// penalize it too much. 76 /// * Density 77 /// * The weight of the cluster divided by the size of the cluster. This is a 78 /// proxy for the amount of execution time spent per byte of the cluster. 79 /// 80 /// It does so given a call graph profile by the following: 81 /// * Build a weighted call graph from the call graph profile 82 /// * Sort input sections by weight 83 /// * For each input section starting with the highest weight 84 /// * Find its most likely predecessor cluster 85 /// * Check if the combined cluster would be too large, or would have too low 86 /// a density. 87 /// * If not, then combine the clusters. 88 /// * Sort non-empty clusters by density 89 class CallGraphSort { 90 public: 91 CallGraphSort(Ctx &); 92 93 DenseMap<const InputSectionBase *, int> run(); 94 95 private: 96 Ctx &ctx; 97 std::vector<Cluster> clusters; 98 std::vector<const InputSectionBase *> sections; 99 }; 100 101 // Maximum amount the combined cluster density can be worse than the original 102 // cluster to consider merging. 103 constexpr int MAX_DENSITY_DEGRADATION = 8; 104 105 // Maximum cluster size in bytes. 106 constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; 107 } // end anonymous namespace 108 109 using SectionPair = 110 std::pair<const InputSectionBase *, const InputSectionBase *>; 111 112 // Take the edge list in ctx.arg.callGraphProfile, resolve symbol names to 113 // Symbols, and generate a graph between InputSections with the provided 114 // weights. 115 CallGraphSort::CallGraphSort(Ctx &ctx) : ctx(ctx) { 116 MapVector<SectionPair, uint64_t> &profile = ctx.arg.callGraphProfile; 117 DenseMap<const InputSectionBase *, int> secToCluster; 118 119 auto getOrCreateNode = [&](const InputSectionBase *isec) -> int { 120 auto res = secToCluster.try_emplace(isec, clusters.size()); 121 if (res.second) { 122 sections.push_back(isec); 123 clusters.emplace_back(clusters.size(), isec->getSize()); 124 } 125 return res.first->second; 126 }; 127 128 // Create the graph. 129 for (std::pair<SectionPair, uint64_t> &c : profile) { 130 const auto *fromSB = cast<InputSectionBase>(c.first.first); 131 const auto *toSB = cast<InputSectionBase>(c.first.second); 132 uint64_t weight = c.second; 133 134 // Ignore edges between input sections belonging to different output 135 // sections. This is done because otherwise we would end up with clusters 136 // containing input sections that can't actually be placed adjacently in the 137 // output. This messes with the cluster size and density calculations. We 138 // would also end up moving input sections in other output sections without 139 // moving them closer to what calls them. 140 if (fromSB->getOutputSection() != toSB->getOutputSection()) 141 continue; 142 143 int from = getOrCreateNode(fromSB); 144 int to = getOrCreateNode(toSB); 145 146 clusters[to].weight += weight; 147 148 if (from == to) 149 continue; 150 151 // Remember the best edge. 152 Cluster &toC = clusters[to]; 153 if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { 154 toC.bestPred.from = from; 155 toC.bestPred.weight = weight; 156 } 157 } 158 for (Cluster &c : clusters) 159 c.initialWeight = c.weight; 160 } 161 162 // It's bad to merge clusters which would degrade the density too much. 163 static bool isNewDensityBad(Cluster &a, Cluster &b) { 164 double newDensity = double(a.weight + b.weight) / double(a.size + b.size); 165 return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; 166 } 167 168 // Find the leader of V's belonged cluster (represented as an equivalence 169 // class). We apply union-find path-halving technique (simple to implement) in 170 // the meantime as it decreases depths and the time complexity. 171 static int getLeader(int *leaders, int v) { 172 while (leaders[v] != v) { 173 leaders[v] = leaders[leaders[v]]; 174 v = leaders[v]; 175 } 176 return v; 177 } 178 179 static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx, 180 Cluster &from, int fromIdx) { 181 int tail1 = into.prev, tail2 = from.prev; 182 into.prev = tail2; 183 cs[tail2].next = intoIdx; 184 from.prev = tail1; 185 cs[tail1].next = fromIdx; 186 into.size += from.size; 187 into.weight += from.weight; 188 from.size = 0; 189 from.weight = 0; 190 } 191 192 // Group InputSections into clusters using the Call-Chain Clustering heuristic 193 // then sort the clusters by density. 194 DenseMap<const InputSectionBase *, int> CallGraphSort::run() { 195 std::vector<int> sorted(clusters.size()); 196 std::unique_ptr<int[]> leaders(new int[clusters.size()]); 197 198 std::iota(leaders.get(), leaders.get() + clusters.size(), 0); 199 std::iota(sorted.begin(), sorted.end(), 0); 200 llvm::stable_sort(sorted, [&](int a, int b) { 201 return clusters[a].getDensity() > clusters[b].getDensity(); 202 }); 203 204 for (int l : sorted) { 205 // The cluster index is the same as the index of its leader here because 206 // clusters[L] has not been merged into another cluster yet. 207 Cluster &c = clusters[l]; 208 209 // Don't consider merging if the edge is unlikely. 210 if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) 211 continue; 212 213 int predL = getLeader(leaders.get(), c.bestPred.from); 214 if (l == predL) 215 continue; 216 217 Cluster *predC = &clusters[predL]; 218 if (c.size + predC->size > MAX_CLUSTER_SIZE) 219 continue; 220 221 if (isNewDensityBad(*predC, c)) 222 continue; 223 224 leaders[l] = predL; 225 mergeClusters(clusters, *predC, predL, c, l); 226 } 227 228 // Sort remaining non-empty clusters by density. 229 sorted.clear(); 230 for (int i = 0, e = (int)clusters.size(); i != e; ++i) 231 if (clusters[i].size > 0) 232 sorted.push_back(i); 233 llvm::stable_sort(sorted, [&](int a, int b) { 234 return clusters[a].getDensity() > clusters[b].getDensity(); 235 }); 236 237 DenseMap<const InputSectionBase *, int> orderMap; 238 int curOrder = -clusters.size(); 239 for (int leader : sorted) { 240 for (int i = leader;;) { 241 orderMap[sections[i]] = curOrder++; 242 i = clusters[i].next; 243 if (i == leader) 244 break; 245 } 246 } 247 if (!ctx.arg.printSymbolOrder.empty()) { 248 std::error_code ec; 249 raw_fd_ostream os(ctx.arg.printSymbolOrder, ec, sys::fs::OF_None); 250 if (ec) { 251 ErrAlways(ctx) << "cannot open " << ctx.arg.printSymbolOrder << ": " 252 << ec.message(); 253 return orderMap; 254 } 255 256 // Print the symbols ordered by C3, in the order of increasing curOrder 257 // Instead of sorting all the orderMap, just repeat the loops above. 258 for (int leader : sorted) 259 for (int i = leader;;) { 260 // Search all the symbols in the file of the section 261 // and find out a Defined symbol with name that is within the section. 262 for (Symbol *sym : sections[i]->file->getSymbols()) 263 if (!sym->isSection()) // Filter out section-type symbols here. 264 if (auto *d = dyn_cast<Defined>(sym)) 265 if (sections[i] == d->section) 266 os << sym->getName() << "\n"; 267 i = clusters[i].next; 268 if (i == leader) 269 break; 270 } 271 } 272 273 return orderMap; 274 } 275 276 // Sort sections by the profile data using the Cache-Directed Sort algorithm. 277 // The placement is done by optimizing the locality by co-locating frequently 278 // executed code sections together. 279 static DenseMap<const InputSectionBase *, int> 280 computeCacheDirectedSortOrder(Ctx &ctx) { 281 SmallVector<uint64_t, 0> funcSizes; 282 SmallVector<uint64_t, 0> funcCounts; 283 SmallVector<codelayout::EdgeCount, 0> callCounts; 284 SmallVector<uint64_t, 0> callOffsets; 285 SmallVector<const InputSectionBase *, 0> sections; 286 DenseMap<const InputSectionBase *, size_t> secToTargetId; 287 288 auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { 289 auto res = secToTargetId.try_emplace(inSec, sections.size()); 290 if (res.second) { 291 // inSec does not appear before in the graph. 292 sections.push_back(inSec); 293 funcSizes.push_back(inSec->getSize()); 294 funcCounts.push_back(0); 295 } 296 return res.first->second; 297 }; 298 299 // Create the graph. 300 for (std::pair<SectionPair, uint64_t> &c : ctx.arg.callGraphProfile) { 301 const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first); 302 const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second); 303 // Ignore edges between input sections belonging to different sections. 304 if (fromSB->getOutputSection() != toSB->getOutputSection()) 305 continue; 306 307 uint64_t weight = c.second; 308 // Ignore edges with zero weight. 309 if (weight == 0) 310 continue; 311 312 size_t from = getOrCreateNode(fromSB); 313 size_t to = getOrCreateNode(toSB); 314 // Ignore self-edges (recursive calls). 315 if (from == to) 316 continue; 317 318 callCounts.push_back({from, to, weight}); 319 // Assume that the jump is at the middle of the input section. The profile 320 // data does not contain jump offsets. 321 callOffsets.push_back((funcSizes[from] + 1) / 2); 322 funcCounts[to] += weight; 323 } 324 325 // Run the layout algorithm. 326 std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout( 327 funcSizes, funcCounts, callCounts, callOffsets); 328 329 // Create the final order. 330 DenseMap<const InputSectionBase *, int> orderMap; 331 int curOrder = -sortedSections.size(); 332 for (uint64_t secIdx : sortedSections) 333 orderMap[sections[secIdx]] = curOrder++; 334 335 return orderMap; 336 } 337 338 // Sort sections by the profile data provided by --callgraph-profile-file. 339 // 340 // This first builds a call graph based on the profile data then merges sections 341 // according either to the C³ or Cache-Directed-Sort ordering algorithm. 342 DenseMap<const InputSectionBase *, int> 343 elf::computeCallGraphProfileOrder(Ctx &ctx) { 344 if (ctx.arg.callGraphProfileSort == CGProfileSortKind::Cdsort) 345 return computeCacheDirectedSortOrder(ctx); 346 return CallGraphSort(ctx).run(); 347 } 348