12f09f445SMaksim Panchenko //===- bolt/Passes/CacheMetrics.cpp - Metrics for instruction cache -------===//
2a34c753fSRafael Auler //
3a34c753fSRafael Auler // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4a34c753fSRafael Auler // See https://llvm.org/LICENSE.txt for license information.
5a34c753fSRafael Auler // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a34c753fSRafael Auler //
7a34c753fSRafael Auler //===----------------------------------------------------------------------===//
8a34c753fSRafael Auler //
92f09f445SMaksim Panchenko // This file implements the CacheMetrics class and functions for showing metrics
102f09f445SMaksim Panchenko // of cache lines.
11a34c753fSRafael Auler //
12a34c753fSRafael Auler //===----------------------------------------------------------------------===//
13a34c753fSRafael Auler
14a34c753fSRafael Auler #include "bolt/Passes/CacheMetrics.h"
15a34c753fSRafael Auler #include "bolt/Core/BinaryBasicBlock.h"
16a34c753fSRafael Auler #include "bolt/Core/BinaryFunction.h"
17a34c753fSRafael Auler #include <unordered_map>
18a34c753fSRafael Auler
19a34c753fSRafael Auler using namespace llvm;
20a34c753fSRafael Auler using namespace bolt;
21a34c753fSRafael Auler
22a34c753fSRafael Auler namespace {
23a34c753fSRafael Auler
240daf303eSspupyrev /// The following constants are used to estimate the number of i-TLB cache
250daf303eSspupyrev /// misses for a given code layout. Empirically the values result in high
260daf303eSspupyrev /// correlations between the estimations and the perf measurements.
270daf303eSspupyrev /// The constants do not affect the code layout algorithms.
280daf303eSspupyrev constexpr unsigned ITLBPageSize = 4096;
290daf303eSspupyrev constexpr unsigned ITLBEntries = 16;
300daf303eSspupyrev
31a34c753fSRafael Auler /// Initialize and return a position map for binary basic blocks
extractBasicBlockInfo(const std::vector<BinaryFunction * > & BinaryFunctions,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)32a34c753fSRafael Auler void extractBasicBlockInfo(
33a34c753fSRafael Auler const std::vector<BinaryFunction *> &BinaryFunctions,
34a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
35a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
36a34c753fSRafael Auler
37a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
38a34c753fSRafael Auler const BinaryContext &BC = BF->getBinaryContext();
39d55dfeafSFabian Parzefall for (BinaryBasicBlock &BB : *BF) {
40a34c753fSRafael Auler if (BF->isSimple() || BC.HasRelocations) {
41a34c753fSRafael Auler // Use addresses/sizes as in the output binary
42d55dfeafSFabian Parzefall BBAddr[&BB] = BB.getOutputAddressRange().first;
43d55dfeafSFabian Parzefall BBSize[&BB] = BB.getOutputSize();
44a34c753fSRafael Auler } else {
45a34c753fSRafael Auler // Output ranges should match the input if the body hasn't changed
46d55dfeafSFabian Parzefall BBAddr[&BB] = BB.getInputAddressRange().first + BF->getAddress();
47d55dfeafSFabian Parzefall BBSize[&BB] = BB.getOriginalSize();
48a34c753fSRafael Auler }
49a34c753fSRafael Auler }
50a34c753fSRafael Auler }
51a34c753fSRafael Auler }
52a34c753fSRafael Auler
53a34c753fSRafael Auler /// Calculate TSP metric, which quantifies the number of fallthrough jumps in
54539b6c68Sspupyrev /// the ordering of basic blocks. The method returns a pair
55539b6c68Sspupyrev /// (the number of fallthrough branches, the total number of branches)
56539b6c68Sspupyrev std::pair<uint64_t, uint64_t>
calcTSPScore(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)5740c2e0faSMaksim Panchenko calcTSPScore(const std::vector<BinaryFunction *> &BinaryFunctions,
58a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
59a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
60539b6c68Sspupyrev uint64_t Score = 0;
61539b6c68Sspupyrev uint64_t JumpCount = 0;
62a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
63a34c753fSRafael Auler if (!BF->hasProfile())
64a34c753fSRafael Auler continue;
65539b6c68Sspupyrev for (BinaryBasicBlock *SrcBB : BF->getLayout().blocks()) {
66539b6c68Sspupyrev auto BI = SrcBB->branch_info_begin();
67539b6c68Sspupyrev for (BinaryBasicBlock *DstBB : SrcBB->successors()) {
68539b6c68Sspupyrev if (SrcBB != DstBB && BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE) {
69539b6c68Sspupyrev JumpCount += BI->Count;
70*c8fc234eSshaw young
71*c8fc234eSshaw young auto BBAddrIt = BBAddr.find(SrcBB);
72*c8fc234eSshaw young assert(BBAddrIt != BBAddr.end());
73*c8fc234eSshaw young uint64_t SrcBBAddr = BBAddrIt->second;
74*c8fc234eSshaw young
75*c8fc234eSshaw young auto BBSizeIt = BBSize.find(SrcBB);
76*c8fc234eSshaw young assert(BBSizeIt != BBSize.end());
77*c8fc234eSshaw young uint64_t SrcBBSize = BBSizeIt->second;
78*c8fc234eSshaw young
79*c8fc234eSshaw young BBAddrIt = BBAddr.find(DstBB);
80*c8fc234eSshaw young assert(BBAddrIt != BBAddr.end());
81*c8fc234eSshaw young uint64_t DstBBAddr = BBAddrIt->second;
82*c8fc234eSshaw young
83*c8fc234eSshaw young if (SrcBBAddr + SrcBBSize == DstBBAddr)
84a34c753fSRafael Auler Score += BI->Count;
85539b6c68Sspupyrev }
86a34c753fSRafael Auler ++BI;
87a34c753fSRafael Auler }
88a34c753fSRafael Auler }
89a34c753fSRafael Auler }
90539b6c68Sspupyrev return std::make_pair(Score, JumpCount);
91a34c753fSRafael Auler }
92a34c753fSRafael Auler
93a34c753fSRafael Auler using Predecessors = std::vector<std::pair<BinaryFunction *, uint64_t>>;
94a34c753fSRafael Auler
95a34c753fSRafael Auler /// Build a simplified version of the call graph: For every function, keep
96a34c753fSRafael Auler /// its callers and the frequencies of the calls
97a34c753fSRafael Auler std::unordered_map<const BinaryFunction *, Predecessors>
extractFunctionCalls(const std::vector<BinaryFunction * > & BinaryFunctions)98a34c753fSRafael Auler extractFunctionCalls(const std::vector<BinaryFunction *> &BinaryFunctions) {
99a34c753fSRafael Auler std::unordered_map<const BinaryFunction *, Predecessors> Calls;
100a34c753fSRafael Auler
101a34c753fSRafael Auler for (BinaryFunction *SrcFunction : BinaryFunctions) {
102a34c753fSRafael Auler const BinaryContext &BC = SrcFunction->getBinaryContext();
103d5c03defSFabian Parzefall for (const BinaryBasicBlock *BB : SrcFunction->getLayout().blocks()) {
104a34c753fSRafael Auler // Find call instructions and extract target symbols from each one
105d5c03defSFabian Parzefall for (const MCInst &Inst : *BB) {
106a34c753fSRafael Auler if (!BC.MIB->isCall(Inst))
107a34c753fSRafael Auler continue;
108a34c753fSRafael Auler
109a34c753fSRafael Auler // Call info
110a34c753fSRafael Auler const MCSymbol *DstSym = BC.MIB->getTargetSymbol(Inst);
111a34c753fSRafael Auler uint64_t Count = BB->getKnownExecutionCount();
112a34c753fSRafael Auler // Ignore calls w/o information
113a34c753fSRafael Auler if (DstSym == nullptr || Count == 0)
114a34c753fSRafael Auler continue;
115a34c753fSRafael Auler
116a34c753fSRafael Auler const BinaryFunction *DstFunction = BC.getFunctionForSymbol(DstSym);
117a34c753fSRafael Auler // Ignore recursive calls
1188477bc67SFabian Parzefall if (DstFunction == nullptr || DstFunction->getLayout().block_empty() ||
119a34c753fSRafael Auler DstFunction == SrcFunction)
120a34c753fSRafael Auler continue;
121a34c753fSRafael Auler
122a34c753fSRafael Auler // Record the call
123a34c753fSRafael Auler Calls[DstFunction].emplace_back(SrcFunction, Count);
124a34c753fSRafael Auler }
125a34c753fSRafael Auler }
126a34c753fSRafael Auler }
127a34c753fSRafael Auler return Calls;
128a34c753fSRafael Auler }
129a34c753fSRafael Auler
130a34c753fSRafael Auler /// Compute expected hit ratio of the i-TLB cache (optimized by HFSortPlus alg).
131a34c753fSRafael Auler /// Given an assignment of functions to the i-TLB pages), we divide all
132a34c753fSRafael Auler /// functions calls into two categories:
133a34c753fSRafael Auler /// - 'short' ones that have a caller-callee distance less than a page;
134a34c753fSRafael Auler /// - 'long' ones where the distance exceeds a page.
13540c2e0faSMaksim Panchenko /// The short calls are likely to result in a i-TLB cache hit. For the long
13640c2e0faSMaksim Panchenko /// ones, the hit/miss result depends on the 'hotness' of the page (i.e., how
13740c2e0faSMaksim Panchenko /// often the page is accessed). Assuming that functions are sent to the i-TLB
13840c2e0faSMaksim Panchenko /// cache in a random order, the probability that a page is present in the cache
13940c2e0faSMaksim Panchenko /// is proportional to the number of samples corresponding to the functions on
14040c2e0faSMaksim Panchenko /// the page. The following procedure detects short and long calls, and
14140c2e0faSMaksim Panchenko /// estimates the expected number of cache misses for the long ones.
expectedCacheHitRatio(const std::vector<BinaryFunction * > & BinaryFunctions,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBAddr,const std::unordered_map<BinaryBasicBlock *,uint64_t> & BBSize)142a34c753fSRafael Auler double expectedCacheHitRatio(
143a34c753fSRafael Auler const std::vector<BinaryFunction *> &BinaryFunctions,
144a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBAddr,
145a34c753fSRafael Auler const std::unordered_map<BinaryBasicBlock *, uint64_t> &BBSize) {
146a34c753fSRafael Auler std::unordered_map<const BinaryFunction *, Predecessors> Calls =
147a34c753fSRafael Auler extractFunctionCalls(BinaryFunctions);
148a34c753fSRafael Auler // Compute 'hotness' of the functions
149a34c753fSRafael Auler double TotalSamples = 0;
150a34c753fSRafael Auler std::unordered_map<BinaryFunction *, double> FunctionSamples;
151a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
152a34c753fSRafael Auler double Samples = 0;
153f92ab6afSAmir Ayupov for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF])
154a34c753fSRafael Auler Samples += Pair.second;
155a34c753fSRafael Auler Samples = std::max(Samples, (double)BF->getKnownExecutionCount());
156a34c753fSRafael Auler FunctionSamples[BF] = Samples;
157a34c753fSRafael Auler TotalSamples += Samples;
158a34c753fSRafael Auler }
159a34c753fSRafael Auler
160a34c753fSRafael Auler // Compute 'hotness' of the pages
161a34c753fSRafael Auler std::unordered_map<uint64_t, double> PageSamples;
162a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
1638477bc67SFabian Parzefall if (BF->getLayout().block_empty())
164a34c753fSRafael Auler continue;
165*c8fc234eSshaw young auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
166*c8fc234eSshaw young assert(BBAddrIt != BBAddr.end());
167*c8fc234eSshaw young const uint64_t Page = BBAddrIt->second / ITLBPageSize;
168*c8fc234eSshaw young
169*c8fc234eSshaw young auto FunctionSamplesIt = FunctionSamples.find(BF);
170*c8fc234eSshaw young assert(FunctionSamplesIt != FunctionSamples.end());
171*c8fc234eSshaw young PageSamples[Page] += FunctionSamplesIt->second;
172a34c753fSRafael Auler }
173a34c753fSRafael Auler
174a34c753fSRafael Auler // Computing the expected number of misses for every function
175a34c753fSRafael Auler double Misses = 0;
176a34c753fSRafael Auler for (BinaryFunction *BF : BinaryFunctions) {
177a34c753fSRafael Auler // Skip the function if it has no samples
178*c8fc234eSshaw young auto FunctionSamplesIt = FunctionSamples.find(BF);
179*c8fc234eSshaw young assert(FunctionSamplesIt != FunctionSamples.end());
180*c8fc234eSshaw young double Samples = FunctionSamplesIt->second;
181*c8fc234eSshaw young if (BF->getLayout().block_empty() || Samples == 0.0)
182a34c753fSRafael Auler continue;
183*c8fc234eSshaw young
184*c8fc234eSshaw young auto BBAddrIt = BBAddr.find(BF->getLayout().block_front());
185*c8fc234eSshaw young assert(BBAddrIt != BBAddr.end());
186*c8fc234eSshaw young const uint64_t Page = BBAddrIt->second / ITLBPageSize;
187a34c753fSRafael Auler // The probability that the page is not present in the cache
1880daf303eSspupyrev const double MissProb =
1890daf303eSspupyrev pow(1.0 - PageSamples[Page] / TotalSamples, ITLBEntries);
190a34c753fSRafael Auler
191a34c753fSRafael Auler // Processing all callers of the function
192a34c753fSRafael Auler for (std::pair<BinaryFunction *, uint64_t> Pair : Calls[BF]) {
193a34c753fSRafael Auler BinaryFunction *SrcFunction = Pair.first;
194*c8fc234eSshaw young
195*c8fc234eSshaw young BBAddrIt = BBAddr.find(SrcFunction->getLayout().block_front());
196*c8fc234eSshaw young assert(BBAddrIt != BBAddr.end());
197*c8fc234eSshaw young const uint64_t SrcPage = BBAddrIt->second / ITLBPageSize;
198a34c753fSRafael Auler // Is this a 'long' or a 'short' call?
199a34c753fSRafael Auler if (Page != SrcPage) {
200a34c753fSRafael Auler // This is a miss
201a34c753fSRafael Auler Misses += MissProb * Pair.second;
202a34c753fSRafael Auler }
203a34c753fSRafael Auler Samples -= Pair.second;
204a34c753fSRafael Auler }
205a34c753fSRafael Auler assert(Samples >= 0.0 && "Function samples computed incorrectly");
206a34c753fSRafael Auler // The remaining samples likely come from the jitted code
207a34c753fSRafael Auler Misses += Samples * MissProb;
208a34c753fSRafael Auler }
209a34c753fSRafael Auler
210a34c753fSRafael Auler return 100.0 * (1.0 - Misses / TotalSamples);
211a34c753fSRafael Auler }
212a34c753fSRafael Auler
21340c2e0faSMaksim Panchenko } // namespace
214a34c753fSRafael Auler
printAll(raw_ostream & OS,const std::vector<BinaryFunction * > & BFs)21552cf0711SAmir Ayupov void CacheMetrics::printAll(raw_ostream &OS,
21652cf0711SAmir Ayupov const std::vector<BinaryFunction *> &BFs) {
217a34c753fSRafael Auler // Stats related to hot-cold code splitting
218a34c753fSRafael Auler size_t NumFunctions = 0;
219a34c753fSRafael Auler size_t NumProfiledFunctions = 0;
220a34c753fSRafael Auler size_t NumHotFunctions = 0;
221a34c753fSRafael Auler size_t NumBlocks = 0;
222a34c753fSRafael Auler size_t NumHotBlocks = 0;
223a34c753fSRafael Auler
224a34c753fSRafael Auler size_t TotalCodeMinAddr = std::numeric_limits<size_t>::max();
225a34c753fSRafael Auler size_t TotalCodeMaxAddr = 0;
226a34c753fSRafael Auler size_t HotCodeMinAddr = std::numeric_limits<size_t>::max();
227a34c753fSRafael Auler size_t HotCodeMaxAddr = 0;
228a34c753fSRafael Auler
229a34c753fSRafael Auler for (BinaryFunction *BF : BFs) {
230a34c753fSRafael Auler NumFunctions++;
231a34c753fSRafael Auler if (BF->hasProfile())
232a34c753fSRafael Auler NumProfiledFunctions++;
233a34c753fSRafael Auler if (BF->hasValidIndex())
234a34c753fSRafael Auler NumHotFunctions++;
235d55dfeafSFabian Parzefall for (const BinaryBasicBlock &BB : *BF) {
236a34c753fSRafael Auler NumBlocks++;
237d55dfeafSFabian Parzefall size_t BBAddrMin = BB.getOutputAddressRange().first;
238d55dfeafSFabian Parzefall size_t BBAddrMax = BB.getOutputAddressRange().second;
239a34c753fSRafael Auler TotalCodeMinAddr = std::min(TotalCodeMinAddr, BBAddrMin);
240a34c753fSRafael Auler TotalCodeMaxAddr = std::max(TotalCodeMaxAddr, BBAddrMax);
241d55dfeafSFabian Parzefall if (BF->hasValidIndex() && !BB.isCold()) {
242a34c753fSRafael Auler NumHotBlocks++;
243a34c753fSRafael Auler HotCodeMinAddr = std::min(HotCodeMinAddr, BBAddrMin);
244a34c753fSRafael Auler HotCodeMaxAddr = std::max(HotCodeMaxAddr, BBAddrMax);
245a34c753fSRafael Auler }
246a34c753fSRafael Auler }
247a34c753fSRafael Auler }
248a34c753fSRafael Auler
24952cf0711SAmir Ayupov OS << format(" There are %zu functions;", NumFunctions)
25040c2e0faSMaksim Panchenko << format(" %zu (%.2lf%%) are in the hot section,", NumHotFunctions,
25140c2e0faSMaksim Panchenko 100.0 * NumHotFunctions / NumFunctions)
25240c2e0faSMaksim Panchenko << format(" %zu (%.2lf%%) have profile\n", NumProfiledFunctions,
25340c2e0faSMaksim Panchenko 100.0 * NumProfiledFunctions / NumFunctions);
25452cf0711SAmir Ayupov OS << format(" There are %zu basic blocks;", NumBlocks)
25540c2e0faSMaksim Panchenko << format(" %zu (%.2lf%%) are in the hot section\n", NumHotBlocks,
25640c2e0faSMaksim Panchenko 100.0 * NumHotBlocks / NumBlocks);
257a34c753fSRafael Auler
258a34c753fSRafael Auler assert(TotalCodeMinAddr <= TotalCodeMaxAddr && "incorrect output addresses");
259a34c753fSRafael Auler size_t HotCodeSize = HotCodeMaxAddr - HotCodeMinAddr;
260a34c753fSRafael Auler size_t TotalCodeSize = TotalCodeMaxAddr - TotalCodeMinAddr;
261a34c753fSRafael Auler
262a34c753fSRafael Auler size_t HugePage2MB = 2 << 20;
26352cf0711SAmir Ayupov OS << format(" Hot code takes %.2lf%% of binary (%zu bytes out of %zu, "
26440c2e0faSMaksim Panchenko "%.2lf huge pages)\n",
26552cf0711SAmir Ayupov 100.0 * HotCodeSize / TotalCodeSize, HotCodeSize, TotalCodeSize,
26652cf0711SAmir Ayupov double(HotCodeSize) / HugePage2MB);
267a34c753fSRafael Auler
268a34c753fSRafael Auler // Stats related to expected cache performance
269a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> BBAddr;
270a34c753fSRafael Auler std::unordered_map<BinaryBasicBlock *, uint64_t> BBSize;
271a34c753fSRafael Auler extractBasicBlockInfo(BFs, BBAddr, BBSize);
272a34c753fSRafael Auler
27352cf0711SAmir Ayupov OS << " Expected i-TLB cache hit ratio: "
274a34c753fSRafael Auler << format("%.2lf%%\n", expectedCacheHitRatio(BFs, BBAddr, BBSize));
275a34c753fSRafael Auler
276539b6c68Sspupyrev auto Stats = calcTSPScore(BFs, BBAddr, BBSize);
27752cf0711SAmir Ayupov OS << " TSP score: "
278539b6c68Sspupyrev << format("%.2lf%% (%zu out of %zu)\n",
279539b6c68Sspupyrev 100.0 * Stats.first / std::max<uint64_t>(Stats.second, 1),
280539b6c68Sspupyrev Stats.first, Stats.second);
281a34c753fSRafael Auler }
282