xref: /netbsd-src/external/apache2/llvm/dist/llvm/tools/llvm-profgen/CSPreInliner.cpp (revision 82d56013d7b633d116a93943de88e08335357a7c)
1*82d56013Sjoerg //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2*82d56013Sjoerg //
3*82d56013Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*82d56013Sjoerg // See https://llvm.org/LICENSE.txt for license information.
5*82d56013Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*82d56013Sjoerg //
7*82d56013Sjoerg //===----------------------------------------------------------------------===//
8*82d56013Sjoerg 
9*82d56013Sjoerg #include "CSPreInliner.h"
10*82d56013Sjoerg #include "llvm/ADT/SCCIterator.h"
11*82d56013Sjoerg #include <cstdint>
12*82d56013Sjoerg #include <queue>
13*82d56013Sjoerg 
14*82d56013Sjoerg #define DEBUG_TYPE "cs-preinliner"
15*82d56013Sjoerg 
16*82d56013Sjoerg using namespace llvm;
17*82d56013Sjoerg using namespace sampleprof;
18*82d56013Sjoerg 
19*82d56013Sjoerg static cl::opt<bool> EnableCSPreInliner(
20*82d56013Sjoerg     "csspgo-preinliner", cl::Hidden, cl::init(false),
21*82d56013Sjoerg     cl::desc("Run a global pre-inliner to merge context profile based on "
22*82d56013Sjoerg              "estimated global top-down inline decisions"));
23*82d56013Sjoerg 
24*82d56013Sjoerg // The switches specify inline thresholds used in SampleProfileLoader inlining.
25*82d56013Sjoerg // TODO: the actual threshold to be tuned here because the size here is based
26*82d56013Sjoerg // on machine code not LLVM IR.
27*82d56013Sjoerg extern cl::opt<int> SampleHotCallSiteThreshold;
28*82d56013Sjoerg extern cl::opt<int> SampleColdCallSiteThreshold;
29*82d56013Sjoerg extern cl::opt<int> ProfileInlineGrowthLimit;
30*82d56013Sjoerg extern cl::opt<int> ProfileInlineLimitMin;
31*82d56013Sjoerg extern cl::opt<int> ProfileInlineLimitMax;
32*82d56013Sjoerg 
33*82d56013Sjoerg static cl::opt<bool> SamplePreInlineReplay(
34*82d56013Sjoerg     "csspgo-replay-preinline", cl::Hidden, cl::init(false),
35*82d56013Sjoerg     cl::desc(
36*82d56013Sjoerg         "Replay previous inlining and adjust context profile accordingly"));
37*82d56013Sjoerg 
CSPreInliner(StringMap<FunctionSamples> & Profiles,uint64_t HotThreshold,uint64_t ColdThreshold)38*82d56013Sjoerg CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles,
39*82d56013Sjoerg                            uint64_t HotThreshold, uint64_t ColdThreshold)
40*82d56013Sjoerg     : ContextTracker(Profiles), ProfileMap(Profiles),
41*82d56013Sjoerg       HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {}
42*82d56013Sjoerg 
buildTopDownOrder()43*82d56013Sjoerg std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
44*82d56013Sjoerg   std::vector<StringRef> Order;
45*82d56013Sjoerg   ProfiledCallGraph ProfiledCG(ContextTracker);
46*82d56013Sjoerg 
47*82d56013Sjoerg   // Now that we have a profiled call graph, construct top-down order
48*82d56013Sjoerg   // by building up SCC and reversing SCC order.
49*82d56013Sjoerg   scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
50*82d56013Sjoerg   while (!I.isAtEnd()) {
51*82d56013Sjoerg     for (ProfiledCallGraphNode *Node : *I) {
52*82d56013Sjoerg       if (Node != ProfiledCG.getEntryNode())
53*82d56013Sjoerg         Order.push_back(Node->Name);
54*82d56013Sjoerg     }
55*82d56013Sjoerg     ++I;
56*82d56013Sjoerg   }
57*82d56013Sjoerg   std::reverse(Order.begin(), Order.end());
58*82d56013Sjoerg 
59*82d56013Sjoerg   return Order;
60*82d56013Sjoerg }
61*82d56013Sjoerg 
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)62*82d56013Sjoerg bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
63*82d56013Sjoerg                                        const FunctionSamples *CallerSamples) {
64*82d56013Sjoerg   assert(CallerSamples && "Expect non-null caller samples");
65*82d56013Sjoerg 
66*82d56013Sjoerg   // Ideally we want to consider everything a function calls, but as far as
67*82d56013Sjoerg   // context profile is concerned, only those frames that are children of
68*82d56013Sjoerg   // current one in the trie is relavent. So we walk the trie instead of call
69*82d56013Sjoerg   // targets from function profile.
70*82d56013Sjoerg   ContextTrieNode *CallerNode =
71*82d56013Sjoerg       ContextTracker.getContextFor(CallerSamples->getContext());
72*82d56013Sjoerg 
73*82d56013Sjoerg   bool HasNewCandidate = false;
74*82d56013Sjoerg   for (auto &Child : CallerNode->getAllChildContext()) {
75*82d56013Sjoerg     ContextTrieNode *CalleeNode = &Child.second;
76*82d56013Sjoerg     FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
77*82d56013Sjoerg     if (!CalleeSamples)
78*82d56013Sjoerg       continue;
79*82d56013Sjoerg 
80*82d56013Sjoerg     // Call site count is more reliable, so we look up the corresponding call
81*82d56013Sjoerg     // target profile in caller's context profile to retrieve call site count.
82*82d56013Sjoerg     uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
83*82d56013Sjoerg     uint64_t CallsiteCount = 0;
84*82d56013Sjoerg     LineLocation Callsite = CalleeNode->getCallSiteLoc();
85*82d56013Sjoerg     if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
86*82d56013Sjoerg       SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
87*82d56013Sjoerg       auto It = TargetCounts.find(CalleeSamples->getName());
88*82d56013Sjoerg       if (It != TargetCounts.end())
89*82d56013Sjoerg         CallsiteCount = It->second;
90*82d56013Sjoerg     }
91*82d56013Sjoerg 
92*82d56013Sjoerg     // TODO: call site and callee entry count should be mostly consistent, add
93*82d56013Sjoerg     // check for that.
94*82d56013Sjoerg     HasNewCandidate = true;
95*82d56013Sjoerg     CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount));
96*82d56013Sjoerg   }
97*82d56013Sjoerg 
98*82d56013Sjoerg   return HasNewCandidate;
99*82d56013Sjoerg }
100*82d56013Sjoerg 
shouldInline(ProfiledInlineCandidate & Candidate)101*82d56013Sjoerg bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
102*82d56013Sjoerg   // If replay inline is requested, simply follow the inline decision of the
103*82d56013Sjoerg   // profiled binary.
104*82d56013Sjoerg   if (SamplePreInlineReplay)
105*82d56013Sjoerg     return Candidate.CalleeSamples->getContext().hasAttribute(
106*82d56013Sjoerg         ContextWasInlined);
107*82d56013Sjoerg 
108*82d56013Sjoerg   // Adjust threshold based on call site hotness, only do this for callsite
109*82d56013Sjoerg   // prioritized inliner because otherwise cost-benefit check is done earlier.
110*82d56013Sjoerg   unsigned int SampleThreshold = SampleColdCallSiteThreshold;
111*82d56013Sjoerg   if (Candidate.CallsiteCount > HotCountThreshold)
112*82d56013Sjoerg     SampleThreshold = SampleHotCallSiteThreshold;
113*82d56013Sjoerg 
114*82d56013Sjoerg   // TODO: for small cold functions, we may inlined them and we need to keep
115*82d56013Sjoerg   // context profile accordingly.
116*82d56013Sjoerg   if (Candidate.CallsiteCount < ColdCountThreshold)
117*82d56013Sjoerg     SampleThreshold = SampleColdCallSiteThreshold;
118*82d56013Sjoerg 
119*82d56013Sjoerg   return (Candidate.SizeCost < SampleThreshold);
120*82d56013Sjoerg }
121*82d56013Sjoerg 
processFunction(const StringRef Name)122*82d56013Sjoerg void CSPreInliner::processFunction(const StringRef Name) {
123*82d56013Sjoerg   LLVM_DEBUG(dbgs() << "Process " << Name
124*82d56013Sjoerg                     << " for context-sensitive pre-inlining\n");
125*82d56013Sjoerg 
126*82d56013Sjoerg   FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
127*82d56013Sjoerg   if (!FSamples)
128*82d56013Sjoerg     return;
129*82d56013Sjoerg 
130*82d56013Sjoerg   // Use the number of lines/probes as proxy for function size for now.
131*82d56013Sjoerg   // TODO: retrieve accurate size from dwarf or binary instead.
132*82d56013Sjoerg   unsigned FuncSize = FSamples->getBodySamples().size();
133*82d56013Sjoerg   unsigned FuncFinalSize = FuncSize;
134*82d56013Sjoerg   unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
135*82d56013Sjoerg   SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
136*82d56013Sjoerg   SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
137*82d56013Sjoerg 
138*82d56013Sjoerg   ProfiledCandidateQueue CQueue;
139*82d56013Sjoerg   getInlineCandidates(CQueue, FSamples);
140*82d56013Sjoerg 
141*82d56013Sjoerg   while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
142*82d56013Sjoerg     ProfiledInlineCandidate Candidate = CQueue.top();
143*82d56013Sjoerg     CQueue.pop();
144*82d56013Sjoerg     bool ShouldInline = false;
145*82d56013Sjoerg     if ((ShouldInline = shouldInline(Candidate))) {
146*82d56013Sjoerg       // We mark context as inlined as the corresponding context profile
147*82d56013Sjoerg       // won't be merged into that function's base profile.
148*82d56013Sjoerg       ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
149*82d56013Sjoerg       Candidate.CalleeSamples->getContext().setAttribute(
150*82d56013Sjoerg           ContextShouldBeInlined);
151*82d56013Sjoerg       FuncFinalSize += Candidate.SizeCost;
152*82d56013Sjoerg       getInlineCandidates(CQueue, Candidate.CalleeSamples);
153*82d56013Sjoerg     }
154*82d56013Sjoerg     LLVM_DEBUG(dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
155*82d56013Sjoerg                       << " context profile for: "
156*82d56013Sjoerg                       << Candidate.CalleeSamples->getNameWithContext()
157*82d56013Sjoerg                       << " (callee size: " << Candidate.SizeCost
158*82d56013Sjoerg                       << ", call count:" << Candidate.CallsiteCount << ")\n");
159*82d56013Sjoerg   }
160*82d56013Sjoerg 
161*82d56013Sjoerg   LLVM_DEBUG({
162*82d56013Sjoerg     if (!CQueue.empty())
163*82d56013Sjoerg       dbgs() << "  Inline candidates ignored due to size limit (inliner "
164*82d56013Sjoerg                 "original size: "
165*82d56013Sjoerg              << FuncSize << ", inliner final size: " << FuncFinalSize
166*82d56013Sjoerg              << ", size limit: " << SizeLimit << ")\n";
167*82d56013Sjoerg 
168*82d56013Sjoerg     while (!CQueue.empty()) {
169*82d56013Sjoerg       ProfiledInlineCandidate Candidate = CQueue.top();
170*82d56013Sjoerg       CQueue.pop();
171*82d56013Sjoerg       bool WasInlined =
172*82d56013Sjoerg           Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
173*82d56013Sjoerg       dbgs() << "    " << Candidate.CalleeSamples->getNameWithContext()
174*82d56013Sjoerg              << " (candidate size:" << Candidate.SizeCost
175*82d56013Sjoerg              << ", call count: " << Candidate.CallsiteCount << ", previously "
176*82d56013Sjoerg              << (WasInlined ? "inlined)\n" : "not inlined)\n");
177*82d56013Sjoerg     }
178*82d56013Sjoerg   });
179*82d56013Sjoerg }
180*82d56013Sjoerg 
run()181*82d56013Sjoerg void CSPreInliner::run() {
182*82d56013Sjoerg   if (!EnableCSPreInliner)
183*82d56013Sjoerg     return;
184*82d56013Sjoerg 
185*82d56013Sjoerg #ifndef NDEBUG
186*82d56013Sjoerg   auto printProfileNames = [](StringMap<FunctionSamples> &Profiles,
187*82d56013Sjoerg                               bool IsInput) {
188*82d56013Sjoerg     dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
189*82d56013Sjoerg            << Profiles.size() << " total):\n";
190*82d56013Sjoerg     for (auto &It : Profiles) {
191*82d56013Sjoerg       const FunctionSamples &Samples = It.second;
192*82d56013Sjoerg       dbgs() << "  [" << Samples.getNameWithContext() << "] "
193*82d56013Sjoerg              << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
194*82d56013Sjoerg              << "\n";
195*82d56013Sjoerg     }
196*82d56013Sjoerg   };
197*82d56013Sjoerg #endif
198*82d56013Sjoerg 
199*82d56013Sjoerg   LLVM_DEBUG(printProfileNames(ProfileMap, true));
200*82d56013Sjoerg 
201*82d56013Sjoerg   // Execute global pre-inliner to estimate a global top-down inline
202*82d56013Sjoerg   // decision and merge profiles accordingly. This helps with profile
203*82d56013Sjoerg   // merge for ThinLTO otherwise we won't be able to merge profiles back
204*82d56013Sjoerg   // to base profile across module/thin-backend boundaries.
205*82d56013Sjoerg   // It also helps better compress context profile to control profile
206*82d56013Sjoerg   // size, as we now only need context profile for functions going to
207*82d56013Sjoerg   // be inlined.
208*82d56013Sjoerg   for (StringRef FuncName : buildTopDownOrder()) {
209*82d56013Sjoerg     processFunction(FuncName);
210*82d56013Sjoerg   }
211*82d56013Sjoerg 
212*82d56013Sjoerg   // Not inlined context profiles are merged into its base, so we can
213*82d56013Sjoerg   // trim out such profiles from the output.
214*82d56013Sjoerg   std::vector<StringRef> ProfilesToBeRemoved;
215*82d56013Sjoerg   for (auto &It : ProfileMap) {
216*82d56013Sjoerg     SampleContext Context = It.second.getContext();
217*82d56013Sjoerg     if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
218*82d56013Sjoerg       assert(Context.hasState(MergedContext) &&
219*82d56013Sjoerg              "Not inlined context profile should be merged already");
220*82d56013Sjoerg       ProfilesToBeRemoved.push_back(It.first());
221*82d56013Sjoerg     }
222*82d56013Sjoerg   }
223*82d56013Sjoerg 
224*82d56013Sjoerg   for (StringRef ContextName : ProfilesToBeRemoved) {
225*82d56013Sjoerg     ProfileMap.erase(ContextName);
226*82d56013Sjoerg   }
227*82d56013Sjoerg 
228*82d56013Sjoerg   // Make sure ProfileMap's key is consistent with FunctionSamples' name.
229*82d56013Sjoerg   SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
230*82d56013Sjoerg 
231*82d56013Sjoerg   LLVM_DEBUG(printProfileNames(ProfileMap, false));
232*82d56013Sjoerg }
233