10b57cec5SDimitry Andric //===- CallGraphSort.cpp --------------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric ///
9*5f757f3fSDimitry Andric /// The file is responsible for sorting sections using LLVM call graph profile
10*5f757f3fSDimitry Andric /// data by placing frequently executed code sections together. The goal of the
11*5f757f3fSDimitry Andric /// placement is to improve the runtime performance of the final executable by
12*5f757f3fSDimitry Andric /// arranging code sections so that i-TLB misses and i-cache misses are reduced.
13*5f757f3fSDimitry Andric ///
14*5f757f3fSDimitry Andric /// The algorithm first builds a call graph based on the profile data and then
15*5f757f3fSDimitry Andric /// iteratively merges "chains" (ordered lists) of input sections which will be
16*5f757f3fSDimitry Andric /// laid out as a unit. There are two implementations for deciding how to
17*5f757f3fSDimitry Andric /// merge a pair of chains:
18*5f757f3fSDimitry Andric /// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
19*5f757f3fSDimitry Andric /// "Optimizing Function Placement for Large-Scale Data-Center Applications"
200b57cec5SDimitry Andric /// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
21*5f757f3fSDimitry Andric /// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
22*5f757f3fSDimitry Andric /// typically produces layouts with higher locality, and hence, yields fewer
23*5f757f3fSDimitry Andric /// instruction cache misses on large binaries.
240b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
250b57cec5SDimitry Andric
260b57cec5SDimitry Andric #include "CallGraphSort.h"
2781ad6265SDimitry Andric #include "InputFiles.h"
2881ad6265SDimitry Andric #include "InputSection.h"
290b57cec5SDimitry Andric #include "Symbols.h"
3081ad6265SDimitry Andric #include "llvm/Support/FileSystem.h"
31*5f757f3fSDimitry Andric #include "llvm/Transforms/Utils/CodeLayout.h"
320b57cec5SDimitry Andric
3385868e8aSDimitry Andric #include <numeric>
3485868e8aSDimitry Andric
350b57cec5SDimitry Andric using namespace llvm;
365ffd83dbSDimitry Andric using namespace lld;
375ffd83dbSDimitry Andric using namespace lld::elf;
380b57cec5SDimitry Andric
390b57cec5SDimitry Andric namespace {
400b57cec5SDimitry Andric struct Edge {
410b57cec5SDimitry Andric int from;
420b57cec5SDimitry Andric uint64_t weight;
430b57cec5SDimitry Andric };
440b57cec5SDimitry Andric
450b57cec5SDimitry Andric struct Cluster {
Cluster__anonac38cd190111::Cluster4685868e8aSDimitry Andric Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
470b57cec5SDimitry Andric
getDensity__anonac38cd190111::Cluster480b57cec5SDimitry Andric double getDensity() const {
490b57cec5SDimitry Andric if (size == 0)
500b57cec5SDimitry Andric return 0;
510b57cec5SDimitry Andric return double(weight) / double(size);
520b57cec5SDimitry Andric }
530b57cec5SDimitry Andric
5485868e8aSDimitry Andric int next;
5585868e8aSDimitry Andric int prev;
56e8d8bef9SDimitry Andric uint64_t size;
570b57cec5SDimitry Andric uint64_t weight = 0;
580b57cec5SDimitry Andric uint64_t initialWeight = 0;
590b57cec5SDimitry Andric Edge bestPred = {-1, 0};
600b57cec5SDimitry Andric };
610b57cec5SDimitry Andric
62*5f757f3fSDimitry Andric /// Implementation of the Call-Chain Clustering (C^3). The goal of this
63*5f757f3fSDimitry Andric /// algorithm is to improve runtime performance of the executable by arranging
64*5f757f3fSDimitry Andric /// code sections such that page table and i-cache misses are minimized.
65*5f757f3fSDimitry Andric ///
66*5f757f3fSDimitry Andric /// Definitions:
67*5f757f3fSDimitry Andric /// * Cluster
68*5f757f3fSDimitry Andric /// * An ordered list of input sections which are laid out as a unit. At the
69*5f757f3fSDimitry Andric /// beginning of the algorithm each input section has its own cluster and
70*5f757f3fSDimitry Andric /// the weight of the cluster is the sum of the weight of all incoming
71*5f757f3fSDimitry Andric /// edges.
72*5f757f3fSDimitry Andric /// * Call-Chain Clustering (C³) Heuristic
73*5f757f3fSDimitry Andric /// * Defines when and how clusters are combined. Pick the highest weighted
74*5f757f3fSDimitry Andric /// input section then add it to its most likely predecessor if it wouldn't
75*5f757f3fSDimitry Andric /// penalize it too much.
76*5f757f3fSDimitry Andric /// * Density
77*5f757f3fSDimitry Andric /// * The weight of the cluster divided by the size of the cluster. This is a
78*5f757f3fSDimitry Andric /// proxy for the amount of execution time spent per byte of the cluster.
79*5f757f3fSDimitry Andric ///
80*5f757f3fSDimitry Andric /// It does so given a call graph profile by the following:
81*5f757f3fSDimitry Andric /// * Build a weighted call graph from the call graph profile
82*5f757f3fSDimitry Andric /// * Sort input sections by weight
83*5f757f3fSDimitry Andric /// * For each input section starting with the highest weight
84*5f757f3fSDimitry Andric /// * Find its most likely predecessor cluster
85*5f757f3fSDimitry Andric /// * Check if the combined cluster would be too large, or would have too low
86*5f757f3fSDimitry Andric /// a density.
87*5f757f3fSDimitry Andric /// * If not, then combine the clusters.
88*5f757f3fSDimitry Andric /// * Sort non-empty clusters by density
890b57cec5SDimitry Andric class CallGraphSort {
900b57cec5SDimitry Andric public:
910b57cec5SDimitry Andric CallGraphSort();
920b57cec5SDimitry Andric
930b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> run();
940b57cec5SDimitry Andric
950b57cec5SDimitry Andric private:
960b57cec5SDimitry Andric std::vector<Cluster> clusters;
970b57cec5SDimitry Andric std::vector<const InputSectionBase *> sections;
980b57cec5SDimitry Andric };
990b57cec5SDimitry Andric
100480093f4SDimitry Andric // Maximum amount the combined cluster density can be worse than the original
1010b57cec5SDimitry Andric // cluster to consider merging.
1020b57cec5SDimitry Andric constexpr int MAX_DENSITY_DEGRADATION = 8;
1030b57cec5SDimitry Andric
1040b57cec5SDimitry Andric // Maximum cluster size in bytes.
1050b57cec5SDimitry Andric constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
1060b57cec5SDimitry Andric } // end anonymous namespace
1070b57cec5SDimitry Andric
1080b57cec5SDimitry Andric using SectionPair =
1090b57cec5SDimitry Andric std::pair<const InputSectionBase *, const InputSectionBase *>;
1100b57cec5SDimitry Andric
1110b57cec5SDimitry Andric // Take the edge list in Config->CallGraphProfile, resolve symbol names to
1120b57cec5SDimitry Andric // Symbols, and generate a graph between InputSections with the provided
1130b57cec5SDimitry Andric // weights.
CallGraphSort()1140b57cec5SDimitry Andric CallGraphSort::CallGraphSort() {
1150b57cec5SDimitry Andric MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
1160b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> secToCluster;
1170b57cec5SDimitry Andric
1180b57cec5SDimitry Andric auto getOrCreateNode = [&](const InputSectionBase *isec) -> int {
11985868e8aSDimitry Andric auto res = secToCluster.try_emplace(isec, clusters.size());
1200b57cec5SDimitry Andric if (res.second) {
1210b57cec5SDimitry Andric sections.push_back(isec);
1220b57cec5SDimitry Andric clusters.emplace_back(clusters.size(), isec->getSize());
1230b57cec5SDimitry Andric }
1240b57cec5SDimitry Andric return res.first->second;
1250b57cec5SDimitry Andric };
1260b57cec5SDimitry Andric
1270b57cec5SDimitry Andric // Create the graph.
1280b57cec5SDimitry Andric for (std::pair<SectionPair, uint64_t> &c : profile) {
1290eae32dcSDimitry Andric const auto *fromSB = cast<InputSectionBase>(c.first.first);
1300eae32dcSDimitry Andric const auto *toSB = cast<InputSectionBase>(c.first.second);
1310b57cec5SDimitry Andric uint64_t weight = c.second;
1320b57cec5SDimitry Andric
1330b57cec5SDimitry Andric // Ignore edges between input sections belonging to different output
1340b57cec5SDimitry Andric // sections. This is done because otherwise we would end up with clusters
1350b57cec5SDimitry Andric // containing input sections that can't actually be placed adjacently in the
1360b57cec5SDimitry Andric // output. This messes with the cluster size and density calculations. We
1370b57cec5SDimitry Andric // would also end up moving input sections in other output sections without
1380b57cec5SDimitry Andric // moving them closer to what calls them.
1390b57cec5SDimitry Andric if (fromSB->getOutputSection() != toSB->getOutputSection())
1400b57cec5SDimitry Andric continue;
1410b57cec5SDimitry Andric
1420b57cec5SDimitry Andric int from = getOrCreateNode(fromSB);
1430b57cec5SDimitry Andric int to = getOrCreateNode(toSB);
1440b57cec5SDimitry Andric
1450b57cec5SDimitry Andric clusters[to].weight += weight;
1460b57cec5SDimitry Andric
1470b57cec5SDimitry Andric if (from == to)
1480b57cec5SDimitry Andric continue;
1490b57cec5SDimitry Andric
1500b57cec5SDimitry Andric // Remember the best edge.
1510b57cec5SDimitry Andric Cluster &toC = clusters[to];
1520b57cec5SDimitry Andric if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
1530b57cec5SDimitry Andric toC.bestPred.from = from;
1540b57cec5SDimitry Andric toC.bestPred.weight = weight;
1550b57cec5SDimitry Andric }
1560b57cec5SDimitry Andric }
1570b57cec5SDimitry Andric for (Cluster &c : clusters)
1580b57cec5SDimitry Andric c.initialWeight = c.weight;
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric
1610b57cec5SDimitry Andric // It's bad to merge clusters which would degrade the density too much.
isNewDensityBad(Cluster & a,Cluster & b)1620b57cec5SDimitry Andric static bool isNewDensityBad(Cluster &a, Cluster &b) {
1630b57cec5SDimitry Andric double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
1640b57cec5SDimitry Andric return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
1650b57cec5SDimitry Andric }
1660b57cec5SDimitry Andric
16785868e8aSDimitry Andric // Find the leader of V's belonged cluster (represented as an equivalence
16885868e8aSDimitry Andric // class). We apply union-find path-halving technique (simple to implement) in
16985868e8aSDimitry Andric // the meantime as it decreases depths and the time complexity.
getLeader(int * leaders,int v)170bdd1243dSDimitry Andric static int getLeader(int *leaders, int v) {
17185868e8aSDimitry Andric while (leaders[v] != v) {
17285868e8aSDimitry Andric leaders[v] = leaders[leaders[v]];
17385868e8aSDimitry Andric v = leaders[v];
17485868e8aSDimitry Andric }
17585868e8aSDimitry Andric return v;
17685868e8aSDimitry Andric }
17785868e8aSDimitry Andric
mergeClusters(std::vector<Cluster> & cs,Cluster & into,int intoIdx,Cluster & from,int fromIdx)17885868e8aSDimitry Andric static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
17985868e8aSDimitry Andric Cluster &from, int fromIdx) {
18085868e8aSDimitry Andric int tail1 = into.prev, tail2 = from.prev;
18185868e8aSDimitry Andric into.prev = tail2;
18285868e8aSDimitry Andric cs[tail2].next = intoIdx;
18385868e8aSDimitry Andric from.prev = tail1;
18485868e8aSDimitry Andric cs[tail1].next = fromIdx;
1850b57cec5SDimitry Andric into.size += from.size;
1860b57cec5SDimitry Andric into.weight += from.weight;
1870b57cec5SDimitry Andric from.size = 0;
1880b57cec5SDimitry Andric from.weight = 0;
1890b57cec5SDimitry Andric }
1900b57cec5SDimitry Andric
1910b57cec5SDimitry Andric // Group InputSections into clusters using the Call-Chain Clustering heuristic
1920b57cec5SDimitry Andric // then sort the clusters by density.
run()19385868e8aSDimitry Andric DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
19485868e8aSDimitry Andric std::vector<int> sorted(clusters.size());
195bdd1243dSDimitry Andric std::unique_ptr<int[]> leaders(new int[clusters.size()]);
1960b57cec5SDimitry Andric
197bdd1243dSDimitry Andric std::iota(leaders.get(), leaders.get() + clusters.size(), 0);
19885868e8aSDimitry Andric std::iota(sorted.begin(), sorted.end(), 0);
19985868e8aSDimitry Andric llvm::stable_sort(sorted, [&](int a, int b) {
2000b57cec5SDimitry Andric return clusters[a].getDensity() > clusters[b].getDensity();
2010b57cec5SDimitry Andric });
2020b57cec5SDimitry Andric
20385868e8aSDimitry Andric for (int l : sorted) {
20485868e8aSDimitry Andric // The cluster index is the same as the index of its leader here because
20585868e8aSDimitry Andric // clusters[L] has not been merged into another cluster yet.
20685868e8aSDimitry Andric Cluster &c = clusters[l];
2070b57cec5SDimitry Andric
2080b57cec5SDimitry Andric // Don't consider merging if the edge is unlikely.
2090b57cec5SDimitry Andric if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
2100b57cec5SDimitry Andric continue;
2110b57cec5SDimitry Andric
212bdd1243dSDimitry Andric int predL = getLeader(leaders.get(), c.bestPred.from);
21385868e8aSDimitry Andric if (l == predL)
2140b57cec5SDimitry Andric continue;
2150b57cec5SDimitry Andric
21685868e8aSDimitry Andric Cluster *predC = &clusters[predL];
2170b57cec5SDimitry Andric if (c.size + predC->size > MAX_CLUSTER_SIZE)
2180b57cec5SDimitry Andric continue;
2190b57cec5SDimitry Andric
2200b57cec5SDimitry Andric if (isNewDensityBad(*predC, c))
2210b57cec5SDimitry Andric continue;
2220b57cec5SDimitry Andric
22385868e8aSDimitry Andric leaders[l] = predL;
22485868e8aSDimitry Andric mergeClusters(clusters, *predC, predL, c, l);
2250b57cec5SDimitry Andric }
2260b57cec5SDimitry Andric
22785868e8aSDimitry Andric // Sort remaining non-empty clusters by density.
22885868e8aSDimitry Andric sorted.clear();
22985868e8aSDimitry Andric for (int i = 0, e = (int)clusters.size(); i != e; ++i)
23085868e8aSDimitry Andric if (clusters[i].size > 0)
23185868e8aSDimitry Andric sorted.push_back(i);
23285868e8aSDimitry Andric llvm::stable_sort(sorted, [&](int a, int b) {
23385868e8aSDimitry Andric return clusters[a].getDensity() > clusters[b].getDensity();
2340b57cec5SDimitry Andric });
2350b57cec5SDimitry Andric
2360b57cec5SDimitry Andric DenseMap<const InputSectionBase *, int> orderMap;
23785868e8aSDimitry Andric int curOrder = 1;
238e8d8bef9SDimitry Andric for (int leader : sorted) {
23985868e8aSDimitry Andric for (int i = leader;;) {
24085868e8aSDimitry Andric orderMap[sections[i]] = curOrder++;
24185868e8aSDimitry Andric i = clusters[i].next;
24285868e8aSDimitry Andric if (i == leader)
24385868e8aSDimitry Andric break;
24485868e8aSDimitry Andric }
245e8d8bef9SDimitry Andric }
2460b57cec5SDimitry Andric if (!config->printSymbolOrder.empty()) {
2470b57cec5SDimitry Andric std::error_code ec;
24885868e8aSDimitry Andric raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
2490b57cec5SDimitry Andric if (ec) {
2500b57cec5SDimitry Andric error("cannot open " + config->printSymbolOrder + ": " + ec.message());
2510b57cec5SDimitry Andric return orderMap;
2520b57cec5SDimitry Andric }
2530b57cec5SDimitry Andric
2540b57cec5SDimitry Andric // Print the symbols ordered by C3, in the order of increasing curOrder
2550b57cec5SDimitry Andric // Instead of sorting all the orderMap, just repeat the loops above.
25685868e8aSDimitry Andric for (int leader : sorted)
25785868e8aSDimitry Andric for (int i = leader;;) {
2580b57cec5SDimitry Andric // Search all the symbols in the file of the section
2590b57cec5SDimitry Andric // and find out a Defined symbol with name that is within the section.
26085868e8aSDimitry Andric for (Symbol *sym : sections[i]->file->getSymbols())
2610b57cec5SDimitry Andric if (!sym->isSection()) // Filter out section-type symbols here.
2620b57cec5SDimitry Andric if (auto *d = dyn_cast<Defined>(sym))
26385868e8aSDimitry Andric if (sections[i] == d->section)
2640b57cec5SDimitry Andric os << sym->getName() << "\n";
26585868e8aSDimitry Andric i = clusters[i].next;
26685868e8aSDimitry Andric if (i == leader)
26785868e8aSDimitry Andric break;
26885868e8aSDimitry Andric }
2690b57cec5SDimitry Andric }
2700b57cec5SDimitry Andric
2710b57cec5SDimitry Andric return orderMap;
2720b57cec5SDimitry Andric }
2730b57cec5SDimitry Andric
274*5f757f3fSDimitry Andric // Sort sections by the profile data using the Cache-Directed Sort algorithm.
275*5f757f3fSDimitry Andric // The placement is done by optimizing the locality by co-locating frequently
276*5f757f3fSDimitry Andric // executed code sections together.
computeCacheDirectedSortOrder()277*5f757f3fSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
278*5f757f3fSDimitry Andric SmallVector<uint64_t, 0> funcSizes;
279*5f757f3fSDimitry Andric SmallVector<uint64_t, 0> funcCounts;
280*5f757f3fSDimitry Andric SmallVector<codelayout::EdgeCount, 0> callCounts;
281*5f757f3fSDimitry Andric SmallVector<uint64_t, 0> callOffsets;
282*5f757f3fSDimitry Andric SmallVector<const InputSectionBase *, 0> sections;
283*5f757f3fSDimitry Andric DenseMap<const InputSectionBase *, size_t> secToTargetId;
284*5f757f3fSDimitry Andric
285*5f757f3fSDimitry Andric auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
286*5f757f3fSDimitry Andric auto res = secToTargetId.try_emplace(inSec, sections.size());
287*5f757f3fSDimitry Andric if (res.second) {
288*5f757f3fSDimitry Andric // inSec does not appear before in the graph.
289*5f757f3fSDimitry Andric sections.push_back(inSec);
290*5f757f3fSDimitry Andric funcSizes.push_back(inSec->getSize());
291*5f757f3fSDimitry Andric funcCounts.push_back(0);
292*5f757f3fSDimitry Andric }
293*5f757f3fSDimitry Andric return res.first->second;
294*5f757f3fSDimitry Andric };
295*5f757f3fSDimitry Andric
296*5f757f3fSDimitry Andric // Create the graph.
297*5f757f3fSDimitry Andric for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
298*5f757f3fSDimitry Andric const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
299*5f757f3fSDimitry Andric const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
300*5f757f3fSDimitry Andric // Ignore edges between input sections belonging to different sections.
301*5f757f3fSDimitry Andric if (fromSB->getOutputSection() != toSB->getOutputSection())
302*5f757f3fSDimitry Andric continue;
303*5f757f3fSDimitry Andric
304*5f757f3fSDimitry Andric uint64_t weight = c.second;
305*5f757f3fSDimitry Andric // Ignore edges with zero weight.
306*5f757f3fSDimitry Andric if (weight == 0)
307*5f757f3fSDimitry Andric continue;
308*5f757f3fSDimitry Andric
309*5f757f3fSDimitry Andric size_t from = getOrCreateNode(fromSB);
310*5f757f3fSDimitry Andric size_t to = getOrCreateNode(toSB);
311*5f757f3fSDimitry Andric // Ignore self-edges (recursive calls).
312*5f757f3fSDimitry Andric if (from == to)
313*5f757f3fSDimitry Andric continue;
314*5f757f3fSDimitry Andric
315*5f757f3fSDimitry Andric callCounts.push_back({from, to, weight});
316*5f757f3fSDimitry Andric // Assume that the jump is at the middle of the input section. The profile
317*5f757f3fSDimitry Andric // data does not contain jump offsets.
318*5f757f3fSDimitry Andric callOffsets.push_back((funcSizes[from] + 1) / 2);
319*5f757f3fSDimitry Andric funcCounts[to] += weight;
320*5f757f3fSDimitry Andric }
321*5f757f3fSDimitry Andric
322*5f757f3fSDimitry Andric // Run the layout algorithm.
323*5f757f3fSDimitry Andric std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
324*5f757f3fSDimitry Andric funcSizes, funcCounts, callCounts, callOffsets);
325*5f757f3fSDimitry Andric
326*5f757f3fSDimitry Andric // Create the final order.
327*5f757f3fSDimitry Andric DenseMap<const InputSectionBase *, int> orderMap;
328*5f757f3fSDimitry Andric int curOrder = 1;
329*5f757f3fSDimitry Andric for (uint64_t secIdx : sortedSections)
330*5f757f3fSDimitry Andric orderMap[sections[secIdx]] = curOrder++;
331*5f757f3fSDimitry Andric
332*5f757f3fSDimitry Andric return orderMap;
333*5f757f3fSDimitry Andric }
334*5f757f3fSDimitry Andric
335349cc55cSDimitry Andric // Sort sections by the profile data provided by --callgraph-profile-file.
3360b57cec5SDimitry Andric //
3370b57cec5SDimitry Andric // This first builds a call graph based on the profile data then merges sections
338*5f757f3fSDimitry Andric // according either to the C³ or Cache-Directed-Sort ordering algorithm.
computeCallGraphProfileOrder()3395ffd83dbSDimitry Andric DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
340*5f757f3fSDimitry Andric if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
341*5f757f3fSDimitry Andric return computeCacheDirectedSortOrder();
3420b57cec5SDimitry Andric return CallGraphSort().run();
3430b57cec5SDimitry Andric }
344