xref: /llvm-project/bolt/lib/Passes/CacheMetrics.cpp (revision c8fc234ee28d0e2a10bd88bae391cb3e32e2ee77)
1 //===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the CacheMetrics class and functions for showing metrics
10 // of cache lines.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "bolt/Passes/CacheMetrics.h"
#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/BinaryFunction.h"
#include <cmath>
#include <unordered_map>
18 
19 using namespace llvm;
20 using namespace bolt;
21 
namespace {

/// The following constants are used to estimate the number of i-TLB cache
/// misses for a given code layout. Empirically the values result in high
/// correlations between the estimations and the perf measurements.
/// The constants do not affect the code layout algorithms.
/// Size of an i-TLB page; output addresses are divided by this value to map
/// a function to its page.
constexpr unsigned ITLBPageSize = 4096;
/// Number of pages assumed to fit in the i-TLB cache; used as the exponent
/// of the miss-probability model below.
constexpr unsigned ITLBEntries = 16;
30 
31 /// Initialize and return a position map for binary basic blocks
extractBasicBlockInfo(const std::vector<BinaryFunction * > & BinaryFunctions,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)32 void extractBasicBlockInfo(
33     const std::vector<BinaryFunction *> &BinaryFunctions,
34     std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
35     std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
36 
37   for (BinaryFunction *BF : BinaryFunctions) {
38     const BinaryContext &BC = BF->getBinaryContext();
39     for (BinaryBasicBlock &BB : *BF) {
40       if (BF->isSimple() || BC.HasRelocations) {
41         // Use addresses/sizes as in the output binary
42         BBAddr[&BB] = BB.getOutputAddressRange().first;
43         BBSize[&BB] = BB.getOutputSize();
44       } else {
45         // Output ranges should match the input if the body hasn't changed
46         BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
47         BBSize[&BB] = BB.getOriginalSize();
48       }
49     }
50   }
51 }
52 
53 /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
54 /// the ordering of basic blocks. The method returns a pair
55 /// (the number of fallthrough branches, the total number of branches)
56 std::pair<uint64_t, uint64_t>
calcTSPScore(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)57 calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
58              const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
59              const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
60   uint64_t Score = 0;
61   uint64_t JumpCount = 0;
62   for (BinaryFunction *BF : BinaryFunctions) {
63     if (!BF->hasProfile())
64       continue;
65     for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
66       auto BI = SrcBB->branch_info_begin();
67       for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
68         if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
69           JumpCount += BI->Count;
70 
71           auto BBAddrIt = BBAddr.find(SrcBB);
72           assert(BBAddrIt != BBAddr.end());
73           uint64_t SrcBBAddr = BBAddrIt->second;
74 
75           auto BBSizeIt = BBSize.find(SrcBB);
76           assert(BBSizeIt != BBSize.end());
77           uint64_t SrcBBSize = BBSizeIt->second;
78 
79           BBAddrIt = BBAddr.find(DstBB);
80           assert(BBAddrIt != BBAddr.end());
81           uint64_t DstBBAddr = BBAddrIt->second;
82 
83           if (SrcBBAddr + SrcBBSize == DstBBAddr)
84             Score += BI->Count;
85         }
86         ++BI;
87       }
88     }
89   }
90   return std::make_pair(Score, JumpCount);
91 }
92 
/// For a callee, the list of (caller function, call frequency) pairs.
using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
94 
95 /// Build a simplified version of the call graph: For every function, keep
96 /// its callers and the frequencies of the calls
97 std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction * > & BinaryFunctions)98 extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
99   std::unordered_map<const BinaryFunction *, Predecessors> Calls;
100 
101   for (BinaryFunction *SrcFunction : BinaryFunctions) {
102     const BinaryContext &BC = SrcFunction->getBinaryContext();
103     for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
104       // Find call instructions and extract target symbols from each one
105       for (const MCInst &Inst : *BB) {
106         if (!BC.MIB->isCall(Inst))
107           continue;
108 
109         // Call info
110         const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
111         uint64_t Count = BB->getKnownExecutionCount();
112         // Ignore calls w/o information
113         if (DstSym == nullptr || Count == 0)
114           continue;
115 
116         const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
117         // Ignore recursive calls
118         if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
119             DstFunction == SrcFunction)
120           continue;
121 
122         // Record the call
123         Calls[DstFunction].emplace_back(SrcFunction, Count);
124       }
125     }
126   }
127   return Calls;
128 }
129 
130 /// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
131 /// Given an assignment of functions to the i-TLB pages), we divide all
132 /// functions calls into two categories:
133 /// - 'short' ones that have a caller-callee distance less than a page;
134 /// - 'long' ones where the distance exceeds a page.
135 /// The short calls are likely to result in a i-TLB cache hit. For the long
136 /// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
137 /// often the page is accessed). Assuming that functions are sent to the i-TLB
138 /// cache in a random order, the probability that a page is present in the cache
139 /// is proportional to the number of samples corresponding to the functions on
140 /// the page. The following procedure detects short and long calls, and
141 /// estimates the expected number of cache misses for the long ones.
expectedCacheHitRatio(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)142 double expectedCacheHitRatio(
143     const std::vector<BinaryFunction *> &BinaryFunctions,
144     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
145     const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
146   std::unordered_map<const BinaryFunction *, Predecessors> Calls =
147       extractFunctionCalls(BinaryFunctions);
148   // Compute 'hotness' of the functions
149   double TotalSamples = 0;
150   std::unordered_map<BinaryFunction *, double> FunctionSamples;
151   for (BinaryFunction *BF : BinaryFunctions) {
152     double Samples = 0;
153     for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
154       Samples += Pair.second;
155     Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
156     FunctionSamples[BF] = Samples;
157     TotalSamples += Samples;
158   }
159 
160   // Compute 'hotness' of the pages
161   std::unordered_map<uint64_t, double> PageSamples;
162   for (BinaryFunction *BF : BinaryFunctions) {
163     if (BF->getLayout().block_empty())
164       continue;
165     auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
166     assert(BBAddrIt != BBAddr.end());
167     const uint64_t Page = BBAddrIt->second / ITLBPageSize;
168 
169     auto FunctionSamplesIt = FunctionSamples.find(BF);
170     assert(FunctionSamplesIt != FunctionSamples.end());
171     PageSamples[Page] += FunctionSamplesIt->second;
172   }
173 
174   // Computing the expected number of misses for every function
175   double Misses = 0;
176   for (BinaryFunction *BF : BinaryFunctions) {
177     // Skip the function if it has no samples
178     auto FunctionSamplesIt = FunctionSamples.find(BF);
179     assert(FunctionSamplesIt != FunctionSamples.end());
180     double Samples = FunctionSamplesIt->second;
181     if (BF->getLayout().block_empty() || Samples == 0.0)
182       continue;
183 
184     auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
185     assert(BBAddrIt != BBAddr.end());
186     const uint64_t Page = BBAddrIt->second / ITLBPageSize;
187     // The probability that the page is not present in the cache
188     const double MissProb =
189         pow(1.0 - PageSamples[Page] / TotalSamples, ITLBEntries);
190 
191     // Processing all callers of the function
192     for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
193       BinaryFunction *SrcFunction = Pair.first;
194 
195       BBAddrIt = BBAddr.find(SrcFunction->getLayout().block_front());
196       assert(BBAddrIt != BBAddr.end());
197       const uint64_t SrcPage = BBAddrIt->second / ITLBPageSize;
198       // Is this a 'long' or a 'short' call?
199       if (Page != SrcPage) {
200         // This is a miss
201         Misses += MissProb * Pair.second;
202       }
203       Samples -= Pair.second;
204     }
205     assert(Samples >= 0.0 && "Function samples computed incorrectly");
206     // The remaining samples likely come from the jitted code
207     Misses += Samples * MissProb;
208   }
209 
210   return 100.0 * (1.0 - Misses / TotalSamples);
211 }
212 
213 } // namespace
214 
printAll(raw_ostream & OS,const std::vector<BinaryFunction * > & BFs)215 void CacheMetrics::printAll(raw_ostream &OS,
216                             const std::vector<BinaryFunction *> &BFs) {
217   // Stats related to hot-cold code splitting
218   size_t NumFunctions = 0;
219   size_t NumProfiledFunctions = 0;
220   size_t NumHotFunctions = 0;
221   size_t NumBlocks = 0;
222   size_t NumHotBlocks = 0;
223 
224   size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
225   size_t TotalCodeMaxAddr = 0;
226   size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
227   size_t HotCodeMaxAddr = 0;
228 
229   for (BinaryFunction *BF : BFs) {
230     NumFunctions++;
231     if (BF->hasProfile())
232       NumProfiledFunctions++;
233     if (BF->hasValidIndex())
234       NumHotFunctions++;
235     for (const BinaryBasicBlock &BB : *BF) {
236       NumBlocks++;
237       size_t BBAddrMin = BB.getOutputAddressRange().first;
238       size_t BBAddrMax = BB.getOutputAddressRange().second;
239       TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
240       TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
241       if (BF->hasValidIndex() && !BB.isCold()) {
242         NumHotBlocks++;
243         HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
244         HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
245       }
246     }
247   }
248 
249   OS << format("  There are %zu functions;", NumFunctions)
250      << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
251                100.0 * NumHotFunctions / NumFunctions)
252      << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
253                100.0 * NumProfiledFunctions / NumFunctions);
254   OS << format("  There are %zu basic blocks;", NumBlocks)
255      << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
256                100.0 * NumHotBlocks / NumBlocks);
257 
258   assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
259   size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
260   size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
261 
262   size_t HugePage2MB = 2 << 20;
263   OS << format("  Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
264                "%.2lf huge pages)\n",
265                100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize,
266                double(HotCodeSize) / HugePage2MB);
267 
268   // Stats related to expected cache performance
269   std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
270   std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
271   extractBasicBlockInfo(BFs, BBAddr, BBSize);
272 
273   OS << "  Expected i-TLB cache hit ratio: "
274      << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
275 
276   auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
277   OS << "  TSP score: "
278      << format("%.2lf%% (%zu out of %zu)\n",
279                100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
280                Stats.first, Stats.second);
281 }
282