1*82d56013Sjoerg //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2*82d56013Sjoerg //
3*82d56013Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*82d56013Sjoerg // See https://llvm.org/LICENSE.txt for license information.
5*82d56013Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*82d56013Sjoerg //
7*82d56013Sjoerg //===----------------------------------------------------------------------===//
8*82d56013Sjoerg
9*82d56013Sjoerg #include "CSPreInliner.h"
10*82d56013Sjoerg #include "llvm/ADT/SCCIterator.h"
11*82d56013Sjoerg #include <cstdint>
12*82d56013Sjoerg #include <queue>
13*82d56013Sjoerg
14*82d56013Sjoerg #define DEBUG_TYPE "cs-preinliner"
15*82d56013Sjoerg
16*82d56013Sjoerg using namespace llvm;
17*82d56013Sjoerg using namespace sampleprof;
18*82d56013Sjoerg
19*82d56013Sjoerg static cl::opt<bool> EnableCSPreInliner(
20*82d56013Sjoerg "csspgo-preinliner", cl::Hidden, cl::init(false),
21*82d56013Sjoerg cl::desc("Run a global pre-inliner to merge context profile based on "
22*82d56013Sjoerg "estimated global top-down inline decisions"));
23*82d56013Sjoerg
24*82d56013Sjoerg // The switches specify inline thresholds used in SampleProfileLoader inlining.
25*82d56013Sjoerg // TODO: the actual threshold to be tuned here because the size here is based
26*82d56013Sjoerg // on machine code not LLVM IR.
27*82d56013Sjoerg extern cl::opt<int> SampleHotCallSiteThreshold;
28*82d56013Sjoerg extern cl::opt<int> SampleColdCallSiteThreshold;
29*82d56013Sjoerg extern cl::opt<int> ProfileInlineGrowthLimit;
30*82d56013Sjoerg extern cl::opt<int> ProfileInlineLimitMin;
31*82d56013Sjoerg extern cl::opt<int> ProfileInlineLimitMax;
32*82d56013Sjoerg
33*82d56013Sjoerg static cl::opt<bool> SamplePreInlineReplay(
34*82d56013Sjoerg "csspgo-replay-preinline", cl::Hidden, cl::init(false),
35*82d56013Sjoerg cl::desc(
36*82d56013Sjoerg "Replay previous inlining and adjust context profile accordingly"));
37*82d56013Sjoerg
CSPreInliner(StringMap<FunctionSamples> & Profiles,uint64_t HotThreshold,uint64_t ColdThreshold)38*82d56013Sjoerg CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles,
39*82d56013Sjoerg uint64_t HotThreshold, uint64_t ColdThreshold)
40*82d56013Sjoerg : ContextTracker(Profiles), ProfileMap(Profiles),
41*82d56013Sjoerg HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {}
42*82d56013Sjoerg
buildTopDownOrder()43*82d56013Sjoerg std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
44*82d56013Sjoerg std::vector<StringRef> Order;
45*82d56013Sjoerg ProfiledCallGraph ProfiledCG(ContextTracker);
46*82d56013Sjoerg
47*82d56013Sjoerg // Now that we have a profiled call graph, construct top-down order
48*82d56013Sjoerg // by building up SCC and reversing SCC order.
49*82d56013Sjoerg scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
50*82d56013Sjoerg while (!I.isAtEnd()) {
51*82d56013Sjoerg for (ProfiledCallGraphNode *Node : *I) {
52*82d56013Sjoerg if (Node != ProfiledCG.getEntryNode())
53*82d56013Sjoerg Order.push_back(Node->Name);
54*82d56013Sjoerg }
55*82d56013Sjoerg ++I;
56*82d56013Sjoerg }
57*82d56013Sjoerg std::reverse(Order.begin(), Order.end());
58*82d56013Sjoerg
59*82d56013Sjoerg return Order;
60*82d56013Sjoerg }
61*82d56013Sjoerg
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)62*82d56013Sjoerg bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
63*82d56013Sjoerg const FunctionSamples *CallerSamples) {
64*82d56013Sjoerg assert(CallerSamples && "Expect non-null caller samples");
65*82d56013Sjoerg
66*82d56013Sjoerg // Ideally we want to consider everything a function calls, but as far as
67*82d56013Sjoerg // context profile is concerned, only those frames that are children of
68*82d56013Sjoerg // current one in the trie is relavent. So we walk the trie instead of call
69*82d56013Sjoerg // targets from function profile.
70*82d56013Sjoerg ContextTrieNode *CallerNode =
71*82d56013Sjoerg ContextTracker.getContextFor(CallerSamples->getContext());
72*82d56013Sjoerg
73*82d56013Sjoerg bool HasNewCandidate = false;
74*82d56013Sjoerg for (auto &Child : CallerNode->getAllChildContext()) {
75*82d56013Sjoerg ContextTrieNode *CalleeNode = &Child.second;
76*82d56013Sjoerg FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
77*82d56013Sjoerg if (!CalleeSamples)
78*82d56013Sjoerg continue;
79*82d56013Sjoerg
80*82d56013Sjoerg // Call site count is more reliable, so we look up the corresponding call
81*82d56013Sjoerg // target profile in caller's context profile to retrieve call site count.
82*82d56013Sjoerg uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
83*82d56013Sjoerg uint64_t CallsiteCount = 0;
84*82d56013Sjoerg LineLocation Callsite = CalleeNode->getCallSiteLoc();
85*82d56013Sjoerg if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
86*82d56013Sjoerg SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
87*82d56013Sjoerg auto It = TargetCounts.find(CalleeSamples->getName());
88*82d56013Sjoerg if (It != TargetCounts.end())
89*82d56013Sjoerg CallsiteCount = It->second;
90*82d56013Sjoerg }
91*82d56013Sjoerg
92*82d56013Sjoerg // TODO: call site and callee entry count should be mostly consistent, add
93*82d56013Sjoerg // check for that.
94*82d56013Sjoerg HasNewCandidate = true;
95*82d56013Sjoerg CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount));
96*82d56013Sjoerg }
97*82d56013Sjoerg
98*82d56013Sjoerg return HasNewCandidate;
99*82d56013Sjoerg }
100*82d56013Sjoerg
shouldInline(ProfiledInlineCandidate & Candidate)101*82d56013Sjoerg bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
102*82d56013Sjoerg // If replay inline is requested, simply follow the inline decision of the
103*82d56013Sjoerg // profiled binary.
104*82d56013Sjoerg if (SamplePreInlineReplay)
105*82d56013Sjoerg return Candidate.CalleeSamples->getContext().hasAttribute(
106*82d56013Sjoerg ContextWasInlined);
107*82d56013Sjoerg
108*82d56013Sjoerg // Adjust threshold based on call site hotness, only do this for callsite
109*82d56013Sjoerg // prioritized inliner because otherwise cost-benefit check is done earlier.
110*82d56013Sjoerg unsigned int SampleThreshold = SampleColdCallSiteThreshold;
111*82d56013Sjoerg if (Candidate.CallsiteCount > HotCountThreshold)
112*82d56013Sjoerg SampleThreshold = SampleHotCallSiteThreshold;
113*82d56013Sjoerg
114*82d56013Sjoerg // TODO: for small cold functions, we may inlined them and we need to keep
115*82d56013Sjoerg // context profile accordingly.
116*82d56013Sjoerg if (Candidate.CallsiteCount < ColdCountThreshold)
117*82d56013Sjoerg SampleThreshold = SampleColdCallSiteThreshold;
118*82d56013Sjoerg
119*82d56013Sjoerg return (Candidate.SizeCost < SampleThreshold);
120*82d56013Sjoerg }
121*82d56013Sjoerg
processFunction(const StringRef Name)122*82d56013Sjoerg void CSPreInliner::processFunction(const StringRef Name) {
123*82d56013Sjoerg LLVM_DEBUG(dbgs() << "Process " << Name
124*82d56013Sjoerg << " for context-sensitive pre-inlining\n");
125*82d56013Sjoerg
126*82d56013Sjoerg FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
127*82d56013Sjoerg if (!FSamples)
128*82d56013Sjoerg return;
129*82d56013Sjoerg
130*82d56013Sjoerg // Use the number of lines/probes as proxy for function size for now.
131*82d56013Sjoerg // TODO: retrieve accurate size from dwarf or binary instead.
132*82d56013Sjoerg unsigned FuncSize = FSamples->getBodySamples().size();
133*82d56013Sjoerg unsigned FuncFinalSize = FuncSize;
134*82d56013Sjoerg unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
135*82d56013Sjoerg SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
136*82d56013Sjoerg SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
137*82d56013Sjoerg
138*82d56013Sjoerg ProfiledCandidateQueue CQueue;
139*82d56013Sjoerg getInlineCandidates(CQueue, FSamples);
140*82d56013Sjoerg
141*82d56013Sjoerg while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
142*82d56013Sjoerg ProfiledInlineCandidate Candidate = CQueue.top();
143*82d56013Sjoerg CQueue.pop();
144*82d56013Sjoerg bool ShouldInline = false;
145*82d56013Sjoerg if ((ShouldInline = shouldInline(Candidate))) {
146*82d56013Sjoerg // We mark context as inlined as the corresponding context profile
147*82d56013Sjoerg // won't be merged into that function's base profile.
148*82d56013Sjoerg ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
149*82d56013Sjoerg Candidate.CalleeSamples->getContext().setAttribute(
150*82d56013Sjoerg ContextShouldBeInlined);
151*82d56013Sjoerg FuncFinalSize += Candidate.SizeCost;
152*82d56013Sjoerg getInlineCandidates(CQueue, Candidate.CalleeSamples);
153*82d56013Sjoerg }
154*82d56013Sjoerg LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined")
155*82d56013Sjoerg << " context profile for: "
156*82d56013Sjoerg << Candidate.CalleeSamples->getNameWithContext()
157*82d56013Sjoerg << " (callee size: " << Candidate.SizeCost
158*82d56013Sjoerg << ", call count:" << Candidate.CallsiteCount << ")\n");
159*82d56013Sjoerg }
160*82d56013Sjoerg
161*82d56013Sjoerg LLVM_DEBUG({
162*82d56013Sjoerg if (!CQueue.empty())
163*82d56013Sjoerg dbgs() << " Inline candidates ignored due to size limit (inliner "
164*82d56013Sjoerg "original size: "
165*82d56013Sjoerg << FuncSize << ", inliner final size: " << FuncFinalSize
166*82d56013Sjoerg << ", size limit: " << SizeLimit << ")\n";
167*82d56013Sjoerg
168*82d56013Sjoerg while (!CQueue.empty()) {
169*82d56013Sjoerg ProfiledInlineCandidate Candidate = CQueue.top();
170*82d56013Sjoerg CQueue.pop();
171*82d56013Sjoerg bool WasInlined =
172*82d56013Sjoerg Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
173*82d56013Sjoerg dbgs() << " " << Candidate.CalleeSamples->getNameWithContext()
174*82d56013Sjoerg << " (candidate size:" << Candidate.SizeCost
175*82d56013Sjoerg << ", call count: " << Candidate.CallsiteCount << ", previously "
176*82d56013Sjoerg << (WasInlined ? "inlined)\n" : "not inlined)\n");
177*82d56013Sjoerg }
178*82d56013Sjoerg });
179*82d56013Sjoerg }
180*82d56013Sjoerg
run()181*82d56013Sjoerg void CSPreInliner::run() {
182*82d56013Sjoerg if (!EnableCSPreInliner)
183*82d56013Sjoerg return;
184*82d56013Sjoerg
185*82d56013Sjoerg #ifndef NDEBUG
186*82d56013Sjoerg auto printProfileNames = [](StringMap<FunctionSamples> &Profiles,
187*82d56013Sjoerg bool IsInput) {
188*82d56013Sjoerg dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
189*82d56013Sjoerg << Profiles.size() << " total):\n";
190*82d56013Sjoerg for (auto &It : Profiles) {
191*82d56013Sjoerg const FunctionSamples &Samples = It.second;
192*82d56013Sjoerg dbgs() << " [" << Samples.getNameWithContext() << "] "
193*82d56013Sjoerg << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
194*82d56013Sjoerg << "\n";
195*82d56013Sjoerg }
196*82d56013Sjoerg };
197*82d56013Sjoerg #endif
198*82d56013Sjoerg
199*82d56013Sjoerg LLVM_DEBUG(printProfileNames(ProfileMap, true));
200*82d56013Sjoerg
201*82d56013Sjoerg // Execute global pre-inliner to estimate a global top-down inline
202*82d56013Sjoerg // decision and merge profiles accordingly. This helps with profile
203*82d56013Sjoerg // merge for ThinLTO otherwise we won't be able to merge profiles back
204*82d56013Sjoerg // to base profile across module/thin-backend boundaries.
205*82d56013Sjoerg // It also helps better compress context profile to control profile
206*82d56013Sjoerg // size, as we now only need context profile for functions going to
207*82d56013Sjoerg // be inlined.
208*82d56013Sjoerg for (StringRef FuncName : buildTopDownOrder()) {
209*82d56013Sjoerg processFunction(FuncName);
210*82d56013Sjoerg }
211*82d56013Sjoerg
212*82d56013Sjoerg // Not inlined context profiles are merged into its base, so we can
213*82d56013Sjoerg // trim out such profiles from the output.
214*82d56013Sjoerg std::vector<StringRef> ProfilesToBeRemoved;
215*82d56013Sjoerg for (auto &It : ProfileMap) {
216*82d56013Sjoerg SampleContext Context = It.second.getContext();
217*82d56013Sjoerg if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
218*82d56013Sjoerg assert(Context.hasState(MergedContext) &&
219*82d56013Sjoerg "Not inlined context profile should be merged already");
220*82d56013Sjoerg ProfilesToBeRemoved.push_back(It.first());
221*82d56013Sjoerg }
222*82d56013Sjoerg }
223*82d56013Sjoerg
224*82d56013Sjoerg for (StringRef ContextName : ProfilesToBeRemoved) {
225*82d56013Sjoerg ProfileMap.erase(ContextName);
226*82d56013Sjoerg }
227*82d56013Sjoerg
228*82d56013Sjoerg // Make sure ProfileMap's key is consistent with FunctionSamples' name.
229*82d56013Sjoerg SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
230*82d56013Sjoerg
231*82d56013Sjoerg LLVM_DEBUG(printProfileNames(ProfileMap, false));
232*82d56013Sjoerg }
233