1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "CSPreInliner.h"
10 #include "llvm/ADT/SCCIterator.h"
11 #include <cstdint>
12 #include <queue>
13
14 #define DEBUG_TYPE "cs-preinliner"
15
16 using namespace llvm;
17 using namespace sampleprof;
18
// Master switch for the CSSPGO global pre-inliner; off by default.
static cl::opt<bool> EnableCSPreInliner(
    "csspgo-preinliner", cl::Hidden, cl::init(false),
    cl::desc("Run a global pre-inliner to merge context profile based on "
             "estimated global top-down inline decisions"));

// The switches specify inline thresholds used in SampleProfileLoader inlining.
// TODO: the actual thresholds need to be tuned here because the size here is
// based on machine code not LLVM IR.
extern cl::opt<int> SampleHotCallSiteThreshold;
extern cl::opt<int> SampleColdCallSiteThreshold;
extern cl::opt<int> ProfileInlineGrowthLimit;
extern cl::opt<int> ProfileInlineLimitMin;
extern cl::opt<int> ProfileInlineLimitMax;

// When set, shouldInline() bypasses the size/threshold heuristic and simply
// replays the inline decisions recorded in the profiled binary (the
// ContextWasInlined attribute).
static cl::opt<bool> SamplePreInlineReplay(
    "csspgo-replay-preinline", cl::Hidden, cl::init(false),
    cl::desc(
        "Replay previous inlining and adjust context profile accordingly"));
37
// Construct the pre-inliner over a set of context profiles. ContextTracker
// builds its context trie from Profiles; ProfileMap is kept by reference so
// run() can merge and trim profiles in place. HotThreshold/ColdThreshold are
// the sample-count cutoffs consulted by shouldInline() when adjusting the
// size threshold for call site hotness.
CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles,
                           uint64_t HotThreshold, uint64_t ColdThreshold)
    : ContextTracker(Profiles), ProfileMap(Profiles),
      HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {}
42
buildTopDownOrder()43 std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
44 std::vector<StringRef> Order;
45 ProfiledCallGraph ProfiledCG(ContextTracker);
46
47 // Now that we have a profiled call graph, construct top-down order
48 // by building up SCC and reversing SCC order.
49 scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
50 while (!I.isAtEnd()) {
51 for (ProfiledCallGraphNode *Node : *I) {
52 if (Node != ProfiledCG.getEntryNode())
53 Order.push_back(Node->Name);
54 }
55 ++I;
56 }
57 std::reverse(Order.begin(), Order.end());
58
59 return Order;
60 }
61
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)62 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
63 const FunctionSamples *CallerSamples) {
64 assert(CallerSamples && "Expect non-null caller samples");
65
66 // Ideally we want to consider everything a function calls, but as far as
67 // context profile is concerned, only those frames that are children of
68 // current one in the trie is relavent. So we walk the trie instead of call
69 // targets from function profile.
70 ContextTrieNode *CallerNode =
71 ContextTracker.getContextFor(CallerSamples->getContext());
72
73 bool HasNewCandidate = false;
74 for (auto &Child : CallerNode->getAllChildContext()) {
75 ContextTrieNode *CalleeNode = &Child.second;
76 FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
77 if (!CalleeSamples)
78 continue;
79
80 // Call site count is more reliable, so we look up the corresponding call
81 // target profile in caller's context profile to retrieve call site count.
82 uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
83 uint64_t CallsiteCount = 0;
84 LineLocation Callsite = CalleeNode->getCallSiteLoc();
85 if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
86 SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
87 auto It = TargetCounts.find(CalleeSamples->getName());
88 if (It != TargetCounts.end())
89 CallsiteCount = It->second;
90 }
91
92 // TODO: call site and callee entry count should be mostly consistent, add
93 // check for that.
94 HasNewCandidate = true;
95 CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount));
96 }
97
98 return HasNewCandidate;
99 }
100
shouldInline(ProfiledInlineCandidate & Candidate)101 bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
102 // If replay inline is requested, simply follow the inline decision of the
103 // profiled binary.
104 if (SamplePreInlineReplay)
105 return Candidate.CalleeSamples->getContext().hasAttribute(
106 ContextWasInlined);
107
108 // Adjust threshold based on call site hotness, only do this for callsite
109 // prioritized inliner because otherwise cost-benefit check is done earlier.
110 unsigned int SampleThreshold = SampleColdCallSiteThreshold;
111 if (Candidate.CallsiteCount > HotCountThreshold)
112 SampleThreshold = SampleHotCallSiteThreshold;
113
114 // TODO: for small cold functions, we may inlined them and we need to keep
115 // context profile accordingly.
116 if (Candidate.CallsiteCount < ColdCountThreshold)
117 SampleThreshold = SampleColdCallSiteThreshold;
118
119 return (Candidate.SizeCost < SampleThreshold);
120 }
121
// Run the pre-inliner on a single function identified by Name: starting from
// its base profile, repeatedly pop the top candidate from a priority queue
// and, when shouldInline() approves, mark the callee's context profile as
// inlined and enqueue the callee's own callees as further candidates, until
// the queue drains or the size budget is exhausted.
void CSPreInliner::processFunction(const StringRef Name) {
  LLVM_DEBUG(dbgs() << "Process " << Name
                    << " for context-sensitive pre-inlining\n");

  FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
  if (!FSamples)
    return;

  // Use the number of lines/probes as proxy for function size for now.
  // TODO: retrieve accurate size from dwarf or binary instead.
  unsigned FuncSize = FSamples->getBodySamples().size();
  unsigned FuncFinalSize = FuncSize;
  // Growth budget: FuncSize times the growth factor, clamped into
  // [ProfileInlineLimitMin, ProfileInlineLimitMax].
  unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
  SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
  SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);

  ProfiledCandidateQueue CQueue;
  getInlineCandidates(CQueue, FSamples);

  while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
    ProfiledInlineCandidate Candidate = CQueue.top();
    CQueue.pop();
    bool ShouldInline = false;
    if ((ShouldInline = shouldInline(Candidate))) {
      // We mark context as inlined as the corresponding context profile
      // won't be merged into that function's base profile.
      ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
      Candidate.CalleeSamples->getContext().setAttribute(
          ContextShouldBeInlined);
      FuncFinalSize += Candidate.SizeCost;
      // Inlining exposes the callee's callees as new candidates.
      getInlineCandidates(CQueue, Candidate.CalleeSamples);
    }
    LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined")
                      << " context profile for: "
                      << Candidate.CalleeSamples->getNameWithContext()
                      << " (callee size: " << Candidate.SizeCost
                      << ", call count:" << Candidate.CallsiteCount << ")\n");
  }

  LLVM_DEBUG({
    // Report candidates left over because the size limit stopped inlining.
    if (!CQueue.empty())
      dbgs() << " Inline candidates ignored due to size limit (inliner "
                "original size: "
             << FuncSize << ", inliner final size: " << FuncFinalSize
             << ", size limit: " << SizeLimit << ")\n";

    while (!CQueue.empty()) {
      ProfiledInlineCandidate Candidate = CQueue.top();
      CQueue.pop();
      bool WasInlined =
          Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
      dbgs() << " " << Candidate.CalleeSamples->getNameWithContext()
             << " (candidate size:" << Candidate.SizeCost
             << ", call count: " << Candidate.CallsiteCount << ", previously "
             << (WasInlined ? "inlined)\n" : "not inlined)\n");
    }
  });
}
180
run()181 void CSPreInliner::run() {
182 if (!EnableCSPreInliner)
183 return;
184
185 #ifndef NDEBUG
186 auto printProfileNames = [](StringMap<FunctionSamples> &Profiles,
187 bool IsInput) {
188 dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
189 << Profiles.size() << " total):\n";
190 for (auto &It : Profiles) {
191 const FunctionSamples &Samples = It.second;
192 dbgs() << " [" << Samples.getNameWithContext() << "] "
193 << Samples.getTotalSamples() << ":" << Samples.getHeadSamples()
194 << "\n";
195 }
196 };
197 #endif
198
199 LLVM_DEBUG(printProfileNames(ProfileMap, true));
200
201 // Execute global pre-inliner to estimate a global top-down inline
202 // decision and merge profiles accordingly. This helps with profile
203 // merge for ThinLTO otherwise we won't be able to merge profiles back
204 // to base profile across module/thin-backend boundaries.
205 // It also helps better compress context profile to control profile
206 // size, as we now only need context profile for functions going to
207 // be inlined.
208 for (StringRef FuncName : buildTopDownOrder()) {
209 processFunction(FuncName);
210 }
211
212 // Not inlined context profiles are merged into its base, so we can
213 // trim out such profiles from the output.
214 std::vector<StringRef> ProfilesToBeRemoved;
215 for (auto &It : ProfileMap) {
216 SampleContext Context = It.second.getContext();
217 if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) {
218 assert(Context.hasState(MergedContext) &&
219 "Not inlined context profile should be merged already");
220 ProfilesToBeRemoved.push_back(It.first());
221 }
222 }
223
224 for (StringRef ContextName : ProfilesToBeRemoved) {
225 ProfileMap.erase(ContextName);
226 }
227
228 // Make sure ProfileMap's key is consistent with FunctionSamples' name.
229 SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles();
230
231 LLVM_DEBUG(printProfileNames(ProfileMap, false));
232 }
233