xref: /netbsd-src/external/apache2/llvm/dist/llvm/tools/llvm-profgen/CSPreInliner.cpp (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "CSPreInliner.h"
10 #include "llvm/ADT/SCCIterator.h"
11 #include <cstdint>
12 #include <queue>
13 
14 #define DEBUG_TYPE "cs-preinliner"
15 
16 using namespace llvm;
17 using namespace sampleprof;
18 
19 static cl::opt<bool> EnableCSPreInliner(
20     "csspgo-preinliner", cl::Hidden, cl::init(false),
21     cl::desc("Run a global pre-inliner to merge context profile based on "
22              "estimated global top-down inline decisions"));
23 
24 // The switches specify inline thresholds used in SampleProfileLoader inlining.
25 // TODO: the actual threshold to be tuned here because the size here is based
26 // on machine code not LLVM IR.
27 extern cl::opt<int> SampleHotCallSiteThreshold;
28 extern cl::opt<int> SampleColdCallSiteThreshold;
29 extern cl::opt<int> ProfileInlineGrowthLimit;
30 extern cl::opt<int> ProfileInlineLimitMin;
31 extern cl::opt<int> ProfileInlineLimitMax;
32 
33 static cl::opt<bool> SamplePreInlineReplay(
34     "csspgo-replay-preinline", cl::Hidden, cl::init(false),
35     cl::desc(
36         "Replay previous inlining and adjust context profile accordingly"));
37 
CSPreInliner(StringMap<FunctionSamples> & Profiles,uint64_t HotThreshold,uint64_t ColdThreshold)38 CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles,
39                            uint64_t HotThreshold, uint64_t ColdThreshold)
40     : ContextTracker(Profiles), ProfileMap(Profiles),
41       HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {}
42 
buildTopDownOrder()43 std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
44   std::vector<StringRef> Order;
45   ProfiledCallGraph ProfiledCG(ContextTracker);
46 
47   // Now that we have a profiled call graph, construct top-down order
48   // by building up SCC and reversing SCC order.
49   scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
50   while (!I.isAtEnd()) {
51     for (ProfiledCallGraphNode *Node : *I) {
52       if (Node != ProfiledCG.getEntryNode())
53         Order.push_back(Node->Name);
54     }
55     ++I;
56   }
57   std::reverse(Order.begin(), Order.end());
58 
59   return Order;
60 }
61 
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)62 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
63                                        const FunctionSamples *CallerSamples) {
64   assert(CallerSamples && "Expect non-null caller samples");
65 
66   // Ideally we want to consider everything a function calls, but as far as
67   // context profile is concerned, only those frames that are children of
68   // current one in the trie is relavent. So we walk the trie instead of call
69   // targets from function profile.
70   ContextTrieNode *CallerNode =
71       ContextTracker.getContextFor(CallerSamples->getContext());
72 
73   bool HasNewCandidate = false;
74   for (auto &Child : CallerNode->getAllChildContext()) {
75     ContextTrieNode *CalleeNode = &Child.second;
76     FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
77     if (!CalleeSamples)
78       continue;
79 
80     // Call site count is more reliable, so we look up the corresponding call
81     // target profile in caller's context profile to retrieve call site count.
82     uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
83     uint64_t CallsiteCount = 0;
84     LineLocation Callsite = CalleeNode->getCallSiteLoc();
85     if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
86       SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
87       auto It = TargetCounts.find(CalleeSamples->getName());
88       if (It != TargetCounts.end())
89         CallsiteCount = It->second;
90     }
91 
92     // TODO: call site and callee entry count should be mostly consistent, add
93     // check for that.
94     HasNewCandidate = true;
95     CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount));
96   }
97 
98   return HasNewCandidate;
99 }
100 
shouldInline(ProfiledInlineCandidate & Candidate)101 bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
102   // If replay inline is requested, simply follow the inline decision of the
103   // profiled binary.
104   if (SamplePreInlineReplay)
105     return Candidate.CalleeSamples->getContext().hasAttribute(
106         ContextWasInlined);
107 
108   // Adjust threshold based on call site hotness, only do this for callsite
109   // prioritized inliner because otherwise cost-benefit check is done earlier.
110   unsigned int SampleThreshold = SampleColdCallSiteThreshold;
111   if (Candidate.CallsiteCount > HotCountThreshold)
112     SampleThreshold = SampleHotCallSiteThreshold;
113 
114   // TODO: for small cold functions, we may inlined them and we need to keep
115   // context profile accordingly.
116   if (Candidate.CallsiteCount < ColdCountThreshold)
117     SampleThreshold = SampleColdCallSiteThreshold;
118 
119   return (Candidate.SizeCost < SampleThreshold);
120 }
121 
processFunction(const StringRef Name)122 void CSPreInliner::processFunction(const StringRef Name) {
123   LLVM_DEBUG(dbgs() << "Process " << Name
124                     << " for context-sensitive pre-inlining\n");
125 
126   FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
127   if (!FSamples)
128     return;
129 
130   // Use the number of lines/probes as proxy for function size for now.
131   // TODO: retrieve accurate size from dwarf or binary instead.
132   unsigned FuncSize = FSamples->getBodySamples().size();
133   unsigned FuncFinalSize = FuncSize;
134   unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
135   SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
136   SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
137 
138   ProfiledCandidateQueue CQueue;
139   getInlineCandidates(CQueue, FSamples);
140 
141   while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
142     ProfiledInlineCandidate Candidate = CQueue.top();
143     CQueue.pop();
144     bool ShouldInline = false;
145     if ((ShouldInline = shouldInline(Candidate))) {
146       // We mark context as inlined as the corresponding context profile
147       // won't be merged into that function's base profile.
148       ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
149       Candidate.CalleeSamples->getContext().setAttribute(
150           ContextShouldBeInlined);
151       FuncFinalSize += Candidate.SizeCost;
152       getInlineCandidates(CQueue, Candidate.CalleeSamples);
153     }
154     LLVM_DEBUG(dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
155                       << " context profile for: "
156                       << Candidate.CalleeSamples->getNameWithContext()
157                       << " (callee size: " << Candidate.SizeCost
158                       << ", call count:" << Candidate.CallsiteCount << ")\n");
159   }
160 
161   LLVM_DEBUG({
162     if (!CQueue.empty())
163       dbgs() << "  Inline candidates ignored due to size limit (inliner "
164                 "original size: "
165              << FuncSize << ", inliner final size: " << FuncFinalSize
166              << ", size limit: " << SizeLimit << ")\n";
167 
168     while (!CQueue.empty()) {
169       ProfiledInlineCandidate Candidate = CQueue.top();
170       CQueue.pop();
171       bool WasInlined =
172           Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
173       dbgs() << "    " << Candidate.CalleeSamples->getNameWithContext()
174              << " (candidate size:" << Candidate.SizeCost
175              << ", call count: " << Candidate.CallsiteCount << ", previously "
176              << (WasInlined ? "inlined)\n" : "not inlined)\n");
177     }
178   });
179 }
180 
run()181 void CSPreInliner::run() {
182   if (!EnableCSPreInliner)
183     return;
184 
185 #ifndef NDEBUG
186   auto printProfileNames = [](StringMap<FunctionSamples> &Profiles,
187                               bool IsInput) {
188     dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
189            << Profiles.size() << " total):\n";
190     for (auto &It : Profiles) {
191       const FunctionSamples &Samples = It.second;
192       dbgs() << "  [" << Samples.getNameWithContext() << "] "
193              << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
194              << "\n";
195     }
196   };
197 #endif
198 
199   LLVM_DEBUG(printProfileNames(ProfileMap, true));
200 
201   // Execute global pre-inliner to estimate a global top-down inline
202   // decision and merge profiles accordingly. This helps with profile
203   // merge for ThinLTO otherwise we won't be able to merge profiles back
204   // to base profile across module/thin-backend boundaries.
205   // It also helps better compress context profile to control profile
206   // size, as we now only need context profile for functions going to
207   // be inlined.
208   for (StringRef FuncName : buildTopDownOrder()) {
209     processFunction(FuncName);
210   }
211 
212   // Not inlined context profiles are merged into its base, so we can
213   // trim out such profiles from the output.
214   std::vector<StringRef> ProfilesToBeRemoved;
215   for (auto &It : ProfileMap) {
216     SampleContext Context = It.second.getContext();
217     if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
218       assert(Context.hasState(MergedContext) &&
219              "Not inlined context profile should be merged already");
220       ProfilesToBeRemoved.push_back(It.first());
221     }
222   }
223 
224   for (StringRef ContextName : ProfilesToBeRemoved) {
225     ProfileMap.erase(ContextName);
226   }
227 
228   // Make sure ProfileMap's key is consistent with FunctionSamples' name.
229   SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
230 
231   LLVM_DEBUG(printProfileNames(ProfileMap, false));
232 }
233