1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "CSPreInliner.h"
10 #include "ProfiledBinary.h"
11 #include "llvm/ADT/SCCIterator.h"
12 #include "llvm/ADT/Statistic.h"
13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14 #include <cstdint>
15 #include <queue>
16
17 #define DEBUG_TYPE "cs-preinliner"
18
19 using namespace llvm;
20 using namespace sampleprof;
21
22 STATISTIC(PreInlNumCSInlined,
23 "Number of functions inlined with context sensitive profile");
24 STATISTIC(PreInlNumCSNotInlined,
25 "Number of functions not inlined with context sensitive profile");
26 STATISTIC(PreInlNumCSInlinedHitMinLimit,
27 "Number of functions with FDO inline stopped due to min size limit");
28 STATISTIC(PreInlNumCSInlinedHitMaxLimit,
29 "Number of functions with FDO inline stopped due to max size limit");
30 STATISTIC(
31 PreInlNumCSInlinedHitGrowthLimit,
32 "Number of functions with FDO inline stopped due to growth size limit");
33
34 // The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds need to be tuned here, because the sizes used
// here are based on machine code rather than LLVM IR.
37 extern cl::opt<int> SampleHotCallSiteThreshold;
38 extern cl::opt<int> SampleColdCallSiteThreshold;
39 extern cl::opt<int> ProfileInlineGrowthLimit;
40 extern cl::opt<int> ProfileInlineLimitMin;
41 extern cl::opt<int> ProfileInlineLimitMax;
42 extern cl::opt<bool> SortProfiledSCC;
43
44 cl::opt<bool> EnableCSPreInliner(
45 "csspgo-preinliner", cl::Hidden, cl::init(true),
46 cl::desc("Run a global pre-inliner to merge context profile based on "
47 "estimated global top-down inline decisions"));
48
49 cl::opt<bool> UseContextCostForPreInliner(
50 "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
51 cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
52
53 static cl::opt<bool> SamplePreInlineReplay(
54 "csspgo-replay-preinline", cl::Hidden, cl::init(false),
55 cl::desc(
56 "Replay previous inlining and adjust context profile accordingly"));
57
// Construct the pre-inliner over an existing context trie (Tracker), using
// Binary for context-sensitive size queries and Summary for hot/cold count
// cutoffs used by the inline decisions.
CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
      // as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size. Only override the
  // defaults when the user did not set the flags explicitly on the command
  // line (getNumOccurrences() == 0).
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 3000;
}
74
buildTopDownOrder()75 std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
76 std::vector<StringRef> Order;
77 ProfiledCallGraph ProfiledCG(ContextTracker);
78
79 // Now that we have a profiled call graph, construct top-down order
80 // by building up SCC and reversing SCC order.
81 scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
82 while (!I.isAtEnd()) {
83 auto Range = *I;
84 if (SortProfiledSCC) {
85 // Sort nodes in one SCC based on callsite hotness.
86 scc_member_iterator<ProfiledCallGraph *> SI(*I);
87 Range = *SI;
88 }
89 for (auto *Node : Range) {
90 if (Node != ProfiledCG.getEntryNode())
91 Order.push_back(Node->Name);
92 }
93 ++I;
94 }
95 std::reverse(Order.begin(), Order.end());
96
97 return Order;
98 }
99
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)100 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
101 const FunctionSamples *CallerSamples) {
102 assert(CallerSamples && "Expect non-null caller samples");
103
104 // Ideally we want to consider everything a function calls, but as far as
105 // context profile is concerned, only those frames that are children of
106 // current one in the trie is relavent. So we walk the trie instead of call
107 // targets from function profile.
108 ContextTrieNode *CallerNode =
109 ContextTracker.getContextNodeForProfile(CallerSamples);
110
111 bool HasNewCandidate = false;
112 for (auto &Child : CallerNode->getAllChildContext()) {
113 ContextTrieNode *CalleeNode = &Child.second;
114 FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
115 if (!CalleeSamples)
116 continue;
117
118 // Call site count is more reliable, so we look up the corresponding call
119 // target profile in caller's context profile to retrieve call site count.
120 uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
121 uint64_t CallsiteCount = 0;
122 LineLocation Callsite = CalleeNode->getCallSiteLoc();
123 if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
124 SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
125 auto It = TargetCounts.find(CalleeSamples->getName());
126 if (It != TargetCounts.end())
127 CallsiteCount = It->second;
128 }
129
130 // TODO: call site and callee entry count should be mostly consistent, add
131 // check for that.
132 HasNewCandidate = true;
133 uint32_t CalleeSize = getFuncSize(CalleeNode);
134 CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
135 CalleeSize);
136 }
137
138 return HasNewCandidate;
139 }
140
getFuncSize(const ContextTrieNode * ContextNode)141 uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
142 if (UseContextCost)
143 return Binary.getFuncSizeForContext(ContextNode);
144
145 return ContextNode->getFunctionSamples()->getBodySamples().size();
146 }
147
// Decide whether Candidate should be inlined by comparing its size cost
// against a sample-count-derived threshold: a fixed cold threshold for cold
// call sites, or a hotness-scaled threshold otherwise.
bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return Candidate.CalleeSamples->getContext().hasAttribute(
        ContextWasInlined);

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust threshold based on normalized hotness, i.e, a value in
    // [0,1]. Use 10% cutoff instead of the max count as the normalization
    // upperbound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    // Clamp so counts above the 10% cutoff all map to full hotness.
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // happen when SampleColdCallSiteThreshold is 0. This is when we do not
    // want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
  }

  return (Candidate.SizeCost < SampleThreshold);
}
184
// Run the pre-inline decision for one function: greedily pop the hottest
// candidates off a priority queue, mark accepted ones as inlined in the
// context tracker, and stop once the size budget is exhausted. Recursively
// discovered callees of inlined candidates are added back to the queue.
void CSPreInliner::processFunction(const StringRef Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  // Size budget: pre-inline size scaled by the growth limit, then clamped by
  // the absolute max limit and raised to at least the min limit.
  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark context as inlined as the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      // The inlined callee's own call sites become candidates too.
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? " Inlined" : " Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  // A non-empty queue here means the size budget stopped inlining; record
  // which of the three limits was the binding one.
  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  // Debug-dump the candidates that were skipped due to the size limit.
  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << " Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << " "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}
257
run()258 void CSPreInliner::run() {
259 #ifndef NDEBUG
260 auto printProfileNames = [](SampleContextTracker &ContextTracker,
261 bool IsInput) {
262 uint32_t Size = 0;
263 for (auto *Node : ContextTracker) {
264 FunctionSamples *FSamples = Node->getFunctionSamples();
265 if (FSamples) {
266 Size++;
267 dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
268 << FSamples->getTotalSamples() << ":"
269 << FSamples->getHeadSamples() << "\n";
270 }
271 }
272 dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
273 << Size << " total):\n";
274 };
275 #endif
276
277 LLVM_DEBUG(printProfileNames(ContextTracker, true));
278
279 // Execute global pre-inliner to estimate a global top-down inline
280 // decision and merge profiles accordingly. This helps with profile
281 // merge for ThinLTO otherwise we won't be able to merge profiles back
282 // to base profile across module/thin-backend boundaries.
283 // It also helps better compress context profile to control profile
284 // size, as we now only need context profile for functions going to
285 // be inlined.
286 for (StringRef FuncName : buildTopDownOrder()) {
287 processFunction(FuncName);
288 }
289
290 // Not inlined context profiles are merged into its base, so we can
291 // trim out such profiles from the output.
292 for (auto *Node : ContextTracker) {
293 FunctionSamples *FProfile = Node->getFunctionSamples();
294 if (FProfile &&
295 (Node->getParentContext() != &ContextTracker.getRootContext() &&
296 !FProfile->getContext().hasState(InlinedContext))) {
297 Node->setFunctionSamples(nullptr);
298 }
299 }
300 FunctionSamples::ProfileIsPreInlined = true;
301
302 LLVM_DEBUG(printProfileNames(ContextTracker, false));
303 }
304