//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds need to be tuned here, because the sizes used
// here are based on machine code, not LLVM IR.
namespace llvm {
cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
} // namespace llvm

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

static cl::opt<int> CSPreinlMultiplierForPrevInl(
    "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
    cl::init(100),
    cl::desc(
        "Multiplier to bump up callsite threshold for previous inlining."));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
      // as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 50000;
}

std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
  std::vector<FunctionId> Order;
  // Trim cold edges to get a more stable call graph. This allows for a more
  // stable top-down order, which in turn helps the stability of the generated
  // profile from run to run.
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));
  ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);

  // Now that we have a profiled call graph, construct a top-down order
  // by building up SCCs and reversing the SCC order.
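  // scc_iterator visits SCCs in reverse topological order (callees before
  // callers), so the std::reverse below turns the collected order into a
  // top-down, caller-before-callee traversal.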
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
  std::reverse(Order.begin(), Order.end());

  return Order;
}

bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as the
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in caller's context profile to retrieve call site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      auto It = CallTargets->find(CalleeSamples->getFunction());
      if (It != CallTargets->end())
        CallsiteCount = It->second;
    }

    // TODO: Call site and callee entry counts should be mostly consistent; add
    // a check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
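    // Queue the candidate using the larger of the call site count and the
    // callee entry count as its hotness, so an undercount in either source
    // does not hide a hot callee.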
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}

uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

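  // Without context-sensitive size information, fall back to the number of
  // profiled source lines carrying body samples as a rough proxy for function
  // size.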
  return ContextNode->getFunctionSamples()->getBodySamples().size();
}

bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  bool WasInlined =
      Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return WasInlined;

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust the threshold based on normalized hotness, i.e., a value
    // in [0,1]. Use the 10% cutoff instead of the max count as the
    // normalization upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // otherwise be zero when SampleColdCallSiteThreshold is 0, i.e. when we do
    // not want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
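    // For example, with the defaults set in the constructor above (hot
    // threshold 1500, cold threshold 0), a callsite with NormalizedHotness of
    // 0.5 gets a threshold of 1500 * 0.5 * 100 + 0 + 1 = 75001 size-cost
    // units.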
    // Bump up the threshold to favor the previous compiler inline decision.
    // The compiler has more insight and knowledge about functions based on
    // their IR and attributes and should be able to make a more reasonable
    // inline decision.
    if (WasInlined)
      SampleThreshold *= CSPreinlMultiplierForPrevInl;
  }

  return (Candidate.SizeCost < SampleThreshold);
}

void CSPreInliner::processFunction(const FunctionId Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
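  // Clamp the growth-based size budget into [ProfileInlineLimitMin,
  // ProfileInlineLimitMax]; the min limit is applied last, so it takes
  // precedence if the two bounds ever conflict.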
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

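  // Candidates are prioritized by call site hotness (see ProfiledCandidateQueue
  // in CSPreInliner.h), so the hottest call sites are evaluated first while the
  // size budget lasts.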
  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // Mark the context as inlined so that the corresponding context profile
      // won't be merged into that function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << "  Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << "    "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}

void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << "  [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate global top-down inline
  // decisions and merge profiles accordingly. This helps with profile
  // merging under ThinLTO, where we otherwise wouldn't be able to merge
  // profiles back into the base profile across module/thin-backend
  // boundaries. It also helps compress the context profile to control
  // profile size, as we now only need context profiles for functions that
  // are going to be inlined.
  for (FunctionId FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that are not inlined are merged into their base
  // profiles, so we can trim such profiles from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}
317