//===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CSPreInliner.h"
#include "ProfiledBinary.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
#include <cstdint>
#include <queue>

#define DEBUG_TYPE "cs-preinliner"

using namespace llvm;
using namespace sampleprof;

STATISTIC(PreInlNumCSInlined,
          "Number of functions inlined with context sensitive profile");
STATISTIC(PreInlNumCSNotInlined,
          "Number of functions not inlined with context sensitive profile");
STATISTIC(PreInlNumCSInlinedHitMinLimit,
          "Number of functions with FDO inline stopped due to min size limit");
STATISTIC(PreInlNumCSInlinedHitMaxLimit,
          "Number of functions with FDO inline stopped due to max size limit");
STATISTIC(
    PreInlNumCSInlinedHitGrowthLimit,
    "Number of functions with FDO inline stopped due to growth size limit");

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: The actual thresholds still need to be tuned here, because the size
// used is based on machine code, not LLVM IR.
extern cl::opt<int> SampleHotCallSiteThreshold;
extern cl::opt<int> SampleColdCallSiteThreshold;
extern cl::opt<int> ProfileInlineGrowthLimit;
extern cl::opt<int> ProfileInlineLimitMin;
extern cl::opt<int> ProfileInlineLimitMax;
extern cl::opt<bool> SortProfiledSCC;

cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

cl::opt<bool> UseContextCostForPreInliner(
    "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
    cl::desc("Use context-sensitive byte size cost for preinliner decisions"));

static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));

CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
                           ProfiledBinary &Binary, ProfileSummary *Summary)
    : UseContextCost(UseContextCostForPreInliner),
      // TODO: Pass in a guid-to-name map in order for
      // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
      // as their profile context.
      ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
  // Set default preinliner hot/cold call site thresholds tuned with CSSPGO
  // for good performance with reasonable profile size.
  if (!SampleHotCallSiteThreshold.getNumOccurrences())
    SampleHotCallSiteThreshold = 1500;
  if (!SampleColdCallSiteThreshold.getNumOccurrences())
    SampleColdCallSiteThreshold = 0;
  if (!ProfileInlineLimitMax.getNumOccurrences())
    ProfileInlineLimitMax = 3000;
}

std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
  std::vector<StringRef> Order;
  ProfiledCallGraph ProfiledCG(ContextTracker);

  // Now that we have a profiled call graph, construct the top-down order
  // by building SCCs and reversing the SCC order.
  scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
  while (!I.isAtEnd()) {
    auto Range = *I;
    if (SortProfiledSCC) {
      // Sort nodes in one SCC based on callsite hotness.
      scc_member_iterator<ProfiledCallGraph *> SI(*I);
      Range = *SI;
    }
    for (auto *Node : Range) {
      if (Node != ProfiledCG.getEntryNode())
        Order.push_back(Node->Name);
    }
    ++I;
  }
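  // SCCs come out of scc_begin in bottom-up order (callees before callers);
  // reversing the list yields the desired top-down processing order.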
  std::reverse(Order.begin(), Order.end());

  return Order;
}

bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
                                       const FunctionSamples *CallerSamples) {
  assert(CallerSamples && "Expect non-null caller samples");

  // Ideally we want to consider everything a function calls, but as far as the
  // context profile is concerned, only those frames that are children of the
  // current one in the trie are relevant. So we walk the trie instead of the
  // call targets from the function profile.
  ContextTrieNode *CallerNode =
      ContextTracker.getContextNodeForProfile(CallerSamples);

  bool HasNewCandidate = false;
  for (auto &Child : CallerNode->getAllChildContext()) {
    ContextTrieNode *CalleeNode = &Child.second;
    FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
    if (!CalleeSamples)
      continue;

    // Call site count is more reliable, so we look up the corresponding call
    // target profile in caller's context profile to retrieve call site count.
    uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
    uint64_t CallsiteCount = 0;
    LineLocation Callsite = CalleeNode->getCallSiteLoc();
    if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
      SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
      auto It = TargetCounts.find(CalleeSamples->getName());
      if (It != TargetCounts.end())
        CallsiteCount = It->second;
    }

    // TODO: call site and callee entry count should be mostly consistent, add
    // check for that.
    HasNewCandidate = true;
    uint32_t CalleeSize = getFuncSize(CalleeNode);
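    // Rank the candidate by the larger of the call site count and the callee's
    // estimated entry count, since either one may be missing or under-counted.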
    CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
                   CalleeSize);
  }

  return HasNewCandidate;
}

uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
  if (UseContextCost)
    return Binary.getFuncSizeForContext(ContextNode);

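  // Without context-sensitive cost, fall back to the number of sampled body
  // locations as a rough proxy for function size.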
  return ContextNode->getFunctionSamples()->getBodySamples().size();
}

bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
  // If replay inline is requested, simply follow the inline decision of the
  // profiled binary.
  if (SamplePreInlineReplay)
    return Candidate.CalleeSamples->getContext().hasAttribute(
        ContextWasInlined);

  unsigned int SampleThreshold = SampleColdCallSiteThreshold;
  uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
      (Summary->getDetailedSummary()));

  if (Candidate.CallsiteCount <= ColdCountThreshold)
    SampleThreshold = SampleColdCallSiteThreshold;
  else {
    // Linearly adjust the threshold based on normalized hotness, i.e., a value
    // in [0,1]. Use the 10% cutoff instead of the max count as the
    // normalization upper bound for stability.
    double NormalizationUpperBound =
        ProfileSummaryBuilder::getEntryForPercentile(
            Summary->getDetailedSummary(), 100000 /* 10% */)
            .MinCount;
    double NormalizationLowerBound = ColdCountThreshold;
    double NormalizedHotness =
        (Candidate.CallsiteCount - NormalizationLowerBound) /
        (NormalizationUpperBound - NormalizationLowerBound);
    if (NormalizedHotness > 1.0)
      NormalizedHotness = 1.0;
    // Add 1 to ensure hot callsites get a non-zero threshold, which could
    // happen when SampleColdCallSiteThreshold is 0, i.e., when we do not
    // want any inlining for cold callsites.
    SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
                      SampleColdCallSiteThreshold + 1;
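    // For example, with the CSSPGO-tuned defaults set in the constructor (hot
    // threshold 1500, cold threshold 0), a callsite with NormalizedHotness of
    // 0.5 gets a size threshold of 1500 * 0.5 * 100 + 0 + 1 = 75001.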
  }

  return (Candidate.SizeCost < SampleThreshold);
}

void CSPreInliner::processFunction(const StringRef Name) {
  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

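  // Compute the size budget: the function may grow by ProfileInlineGrowthLimit
  // times its pre-inline size, clamped to the range
  // [ProfileInlineLimitMin, ProfileInlineLimitMax].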
  unsigned FuncSize =
      getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
  unsigned FuncFinalSize = FuncSize;
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining (pre-inline size: "
                    << FuncSize << ", size limit: " << SizeLimit << ")\n");

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

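  // Greedily evaluate candidates, hottest call sites first, until the queue is
  // drained or the size budget is exhausted. Inlining a candidate exposes its
  // callees in the context trie as new candidates.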
  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // Mark the context as inlined so that the corresponding context profile
      // will not be merged into the function's base profile.
      ++PreInlNumCSInlined;
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    } else {
      ++PreInlNumCSNotInlined;
    }
    LLVM_DEBUG(
        dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
               << " context profile for: "
               << ContextTracker.getContextString(*Candidate.CalleeSamples)
               << " (callee size: " << Candidate.SizeCost
               << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

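  // If candidates remain, attribute the early stop to whichever limit ended up
  // capping SizeLimit.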
  if (!CQueue.empty()) {
    if (SizeLimit == (unsigned)ProfileInlineLimitMax)
      ++PreInlNumCSInlinedHitMaxLimit;
    else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
      ++PreInlNumCSInlinedHitMinLimit;
    else
      ++PreInlNumCSInlinedHitGrowthLimit;
  }

  LLVM_DEBUG({
    if (!CQueue.empty())
      dbgs() << "  Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << "    "
             << ContextTracker.getContextString(*Candidate.CalleeSamples)
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}

void CSPreInliner::run() {
#ifndef NDEBUG
  auto printProfileNames = [](SampleContextTracker &ContextTracker,
                              bool IsInput) {
    uint32_t Size = 0;
    for (auto *Node : ContextTracker) {
      FunctionSamples *FSamples = Node->getFunctionSamples();
      if (FSamples) {
        Size++;
        dbgs() << "  [" << ContextTracker.getContextString(Node) << "] "
               << FSamples->getTotalSamples() << ":"
               << FSamples->getHeadSamples() << "\n";
      }
    }
    dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
           << Size << " total):\n";
  };
#endif

  LLVM_DEBUG(printProfileNames(ContextTracker, true));

  // Execute the global pre-inliner to estimate global top-down inline
  // decisions and merge profiles accordingly. This helps with profile merging
  // for ThinLTO, where we otherwise could not merge profiles back into the
  // base profile across module/thin-backend boundaries.
  // It also helps compress the context profile to control profile size, since
  // we now only need context profiles for functions that are going to be
  // inlined.
  for (StringRef FuncName : buildTopDownOrder()) {
    processFunction(FuncName);
  }

  // Context profiles that were not inlined have been merged into their base
  // profiles, so such profiles can be trimmed from the output.
  for (auto *Node : ContextTracker) {
    FunctionSamples *FProfile = Node->getFunctionSamples();
    if (FProfile &&
        (Node->getParentContext() != &ContextTracker.getRootContext() &&
         !FProfile->getContext().hasState(InlinedContext))) {
      Node->setFunctionSamples(nullptr);
    }
  }
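  // Mark the profile as pre-inlined; the remaining context profiles now
  // reflect the estimated inline decisions.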
  FunctionSamples::ProfileIsPreInlined = true;

  LLVM_DEBUG(printProfileNames(ContextTracker, false));
}