130b02323SWenlei He //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
230b02323SWenlei He //
330b02323SWenlei He // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
430b02323SWenlei He // See https://llvm.org/LICENSE.txt for license information.
530b02323SWenlei He // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
630b02323SWenlei He //
730b02323SWenlei He //===----------------------------------------------------------------------===//
830b02323SWenlei He
930b02323SWenlei He #include "CSPreInliner.h"
10eca03d27SWenlei He #include "ProfiledBinary.h"
1130b02323SWenlei He #include "llvm/ADT/SCCIterator.h"
12f10004e7SWenlei He #include "llvm/ADT/Statistic.h"
13db29f437Sserge-sans-paille #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14e7247f10STom Stellard #include "llvm/Transforms/IPO/SampleProfile.h"
1530b02323SWenlei He #include <cstdint>
1630b02323SWenlei He #include <queue>
1730b02323SWenlei He
1830b02323SWenlei He #define DEBUG_TYPE "cs-preinliner"
1930b02323SWenlei He
2030b02323SWenlei He using namespace llvm;
2130b02323SWenlei He using namespace sampleprof;
2230b02323SWenlei He
23f10004e7SWenlei He STATISTIC(PreInlNumCSInlined,
24f10004e7SWenlei He "Number of functions inlined with context sensitive profile");
25f10004e7SWenlei He STATISTIC(PreInlNumCSNotInlined,
26f10004e7SWenlei He "Number of functions not inlined with context sensitive profile");
27f10004e7SWenlei He STATISTIC(PreInlNumCSInlinedHitMinLimit,
28f10004e7SWenlei He "Number of functions with FDO inline stopped due to min size limit");
29f10004e7SWenlei He STATISTIC(PreInlNumCSInlinedHitMaxLimit,
30f10004e7SWenlei He "Number of functions with FDO inline stopped due to max size limit");
31f10004e7SWenlei He STATISTIC(
32f10004e7SWenlei He PreInlNumCSInlinedHitGrowthLimit,
33f10004e7SWenlei He "Number of functions with FDO inline stopped due to growth size limit");
34f10004e7SWenlei He
3530b02323SWenlei He // The switches specify inline thresholds used in SampleProfileLoader inlining.
3630b02323SWenlei He // TODO: the actual threshold to be tuned here because the size here is based
3730b02323SWenlei He // on machine code not LLVM IR.
381e692113SFangrui Song namespace llvm {
39eca03d27SWenlei He cl::opt<bool> EnableCSPreInliner(
40f6f0409fSWenlei He "csspgo-preinliner", cl::Hidden, cl::init(true),
41eca03d27SWenlei He cl::desc("Run a global pre-inliner to merge context profile based on "
42eca03d27SWenlei He "estimated global top-down inline decisions"));
43eca03d27SWenlei He
44eca03d27SWenlei He cl::opt<bool> UseContextCostForPreInliner(
45446e2162SWenlei He "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
46eca03d27SWenlei He cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
471e692113SFangrui Song } // namespace llvm
48eca03d27SWenlei He
4930b02323SWenlei He static cl::opt<bool> SamplePreInlineReplay(
5030b02323SWenlei He "csspgo-replay-preinline", cl::Hidden, cl::init(false),
5130b02323SWenlei He cl::desc(
5230b02323SWenlei He "Replay previous inlining and adjust context profile accordingly"));
5330b02323SWenlei He
545cbcaf16SHongtao Yu static cl::opt<int> CSPreinlMultiplierForPrevInl(
555cbcaf16SHongtao Yu "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
565cbcaf16SHongtao Yu cl::init(100),
575cbcaf16SHongtao Yu cl::desc(
585cbcaf16SHongtao Yu "Multiplier to bump up callsite threshold for previous inlining."));
595cbcaf16SHongtao Yu
CSPreInliner(SampleContextTracker & Tracker,ProfiledBinary & Binary,ProfileSummary * Summary)607e86b13cSwlei CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
617e86b13cSwlei ProfiledBinary &Binary, ProfileSummary *Summary)
627ca80300SHongtao Yu : UseContextCost(UseContextCostForPreInliner),
637ca80300SHongtao Yu // TODO: Pass in a guid-to-name map in order for
647ca80300SHongtao Yu // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
657ca80300SHongtao Yu // as their profile context.
667e86b13cSwlei ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
67f6f0409fSWenlei He // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
68f6f0409fSWenlei He // for good performance with reasonable profile size.
69f6f0409fSWenlei He if (!SampleHotCallSiteThreshold.getNumOccurrences())
70f6f0409fSWenlei He SampleHotCallSiteThreshold = 1500;
71f6f0409fSWenlei He if (!SampleColdCallSiteThreshold.getNumOccurrences())
72f6f0409fSWenlei He SampleColdCallSiteThreshold = 0;
737d293744SHongtao Yu if (!ProfileInlineLimitMax.getNumOccurrences())
7439eb1c61SHongtao Yu ProfileInlineLimitMax = 50000;
75f6f0409fSWenlei He }
7630b02323SWenlei He
buildTopDownOrder()77ef0e0adcSWilliam Junda Huang std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
78ef0e0adcSWilliam Junda Huang std::vector<FunctionId> Order;
795c2ae37bSHongtao Yu // Trim cold edges to get a more stable call graph. This allows for a more
805c2ae37bSHongtao Yu // stable top-down order which in turns helps the stablity of the generated
815c2ae37bSHongtao Yu // profile from run to run.
825c2ae37bSHongtao Yu uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
835c2ae37bSHongtao Yu (Summary->getDetailedSummary()));
845c2ae37bSHongtao Yu ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);
8530b02323SWenlei He
8630b02323SWenlei He // Now that we have a profiled call graph, construct top-down order
8730b02323SWenlei He // by building up SCC and reversing SCC order.
8830b02323SWenlei He scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
8930b02323SWenlei He while (!I.isAtEnd()) {
90bf317f66SHongtao Yu auto Range = *I;
91bf317f66SHongtao Yu if (SortProfiledSCC) {
92bf317f66SHongtao Yu // Sort nodes in one SCC based on callsite hotness.
93bf317f66SHongtao Yu scc_member_iterator<ProfiledCallGraph *> SI(*I);
94bf317f66SHongtao Yu Range = *SI;
95bf317f66SHongtao Yu }
96bf317f66SHongtao Yu for (auto *Node : Range) {
9730b02323SWenlei He if (Node != ProfiledCG.getEntryNode())
9830b02323SWenlei He Order.push_back(Node->Name);
9930b02323SWenlei He }
10030b02323SWenlei He ++I;
10130b02323SWenlei He }
10230b02323SWenlei He std::reverse(Order.begin(), Order.end());
10330b02323SWenlei He
10430b02323SWenlei He return Order;
10530b02323SWenlei He }
10630b02323SWenlei He
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)10730b02323SWenlei He bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
10830b02323SWenlei He const FunctionSamples *CallerSamples) {
10930b02323SWenlei He assert(CallerSamples && "Expect non-null caller samples");
11030b02323SWenlei He
11130b02323SWenlei He // Ideally we want to consider everything a function calls, but as far as
11230b02323SWenlei He // context profile is concerned, only those frames that are children of
11330b02323SWenlei He // current one in the trie is relavent. So we walk the trie instead of call
11430b02323SWenlei He // targets from function profile.
11530b02323SWenlei He ContextTrieNode *CallerNode =
1167e86b13cSwlei ContextTracker.getContextNodeForProfile(CallerSamples);
11730b02323SWenlei He
11830b02323SWenlei He bool HasNewCandidate = false;
11930b02323SWenlei He for (auto &Child : CallerNode->getAllChildContext()) {
12030b02323SWenlei He ContextTrieNode *CalleeNode = &Child.second;
12130b02323SWenlei He FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
12230b02323SWenlei He if (!CalleeSamples)
12330b02323SWenlei He continue;
12430b02323SWenlei He
12530b02323SWenlei He // Call site count is more reliable, so we look up the corresponding call
12630b02323SWenlei He // target profile in caller's context profile to retrieve call site count.
1277b81a81dSMircea Trofin uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
12830b02323SWenlei He uint64_t CallsiteCount = 0;
12930b02323SWenlei He LineLocation Callsite = CalleeNode->getCallSiteLoc();
13030b02323SWenlei He if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
131*9423e459SBenjamin Kramer auto It = CallTargets->find(CalleeSamples->getFunction());
132*9423e459SBenjamin Kramer if (It != CallTargets->end())
13330b02323SWenlei He CallsiteCount = It->second;
13430b02323SWenlei He }
13530b02323SWenlei He
13630b02323SWenlei He // TODO: call site and callee entry count should be mostly consistent, add
13730b02323SWenlei He // check for that.
13830b02323SWenlei He HasNewCandidate = true;
1397e86b13cSwlei uint32_t CalleeSize = getFuncSize(CalleeNode);
140eca03d27SWenlei He CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
141eca03d27SWenlei He CalleeSize);
14230b02323SWenlei He }
14330b02323SWenlei He
14430b02323SWenlei He return HasNewCandidate;
14530b02323SWenlei He }
14630b02323SWenlei He
getFuncSize(const ContextTrieNode * ContextNode)1477e86b13cSwlei uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
1487e86b13cSwlei if (UseContextCost)
1497e86b13cSwlei return Binary.getFuncSizeForContext(ContextNode);
150eca03d27SWenlei He
1517e86b13cSwlei return ContextNode->getFunctionSamples()->getBodySamples().size();
152eca03d27SWenlei He }
153eca03d27SWenlei He
shouldInline(ProfiledInlineCandidate & Candidate)15430b02323SWenlei He bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
1555cbcaf16SHongtao Yu bool WasInlined =
1565cbcaf16SHongtao Yu Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
15730b02323SWenlei He // If replay inline is requested, simply follow the inline decision of the
15830b02323SWenlei He // profiled binary.
15930b02323SWenlei He if (SamplePreInlineReplay)
1605cbcaf16SHongtao Yu return WasInlined;
16130b02323SWenlei He
16230b02323SWenlei He unsigned int SampleThreshold = SampleColdCallSiteThreshold;
163a4190037SHongtao Yu uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
164a4190037SHongtao Yu (Summary->getDetailedSummary()));
16530b02323SWenlei He
166a4190037SHongtao Yu if (Candidate.CallsiteCount <= ColdCountThreshold)
16730b02323SWenlei He SampleThreshold = SampleColdCallSiteThreshold;
168a4190037SHongtao Yu else {
169a4190037SHongtao Yu // Linearly adjust threshold based on normalized hotness, i.e, a value in
170a4190037SHongtao Yu // [0,1]. Use 10% cutoff instead of the max count as the normalization
171a4190037SHongtao Yu // upperbound for stability.
172a4190037SHongtao Yu double NormalizationUpperBound =
173a4190037SHongtao Yu ProfileSummaryBuilder::getEntryForPercentile(
174a4190037SHongtao Yu Summary->getDetailedSummary(), 100000 /* 10% */)
175a4190037SHongtao Yu .MinCount;
176a4190037SHongtao Yu double NormalizationLowerBound = ColdCountThreshold;
177a4190037SHongtao Yu double NormalizedHotness =
178a4190037SHongtao Yu (Candidate.CallsiteCount - NormalizationLowerBound) /
179a4190037SHongtao Yu (NormalizationUpperBound - NormalizationLowerBound);
180a4190037SHongtao Yu if (NormalizedHotness > 1.0)
181a4190037SHongtao Yu NormalizedHotness = 1.0;
182111fcb0dSFangrui Song // Add 1 to ensure hot callsites get a non-zero threshold, which could
183a4190037SHongtao Yu // happen when SampleColdCallSiteThreshold is 0. This is when we do not
184a4190037SHongtao Yu // want any inlining for cold callsites.
185a4190037SHongtao Yu SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
186a4190037SHongtao Yu SampleColdCallSiteThreshold + 1;
1875cbcaf16SHongtao Yu // Bump up the threshold to favor previous compiler inline decision. The
1885cbcaf16SHongtao Yu // compiler has more insight and knowledge about functions based on their IR
1895cbcaf16SHongtao Yu // and attribures and should be able to make a more reasonable inline
1905cbcaf16SHongtao Yu // decision.
1915cbcaf16SHongtao Yu if (WasInlined)
1925cbcaf16SHongtao Yu SampleThreshold *= CSPreinlMultiplierForPrevInl;
193a4190037SHongtao Yu }
19430b02323SWenlei He
19530b02323SWenlei He return (Candidate.SizeCost < SampleThreshold);
19630b02323SWenlei He }
19730b02323SWenlei He
processFunction(const FunctionId Name)198ef0e0adcSWilliam Junda Huang void CSPreInliner::processFunction(const FunctionId Name) {
19930b02323SWenlei He FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
20030b02323SWenlei He if (!FSamples)
20130b02323SWenlei He return;
20230b02323SWenlei He
2037e86b13cSwlei unsigned FuncSize =
2047e86b13cSwlei getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
20530b02323SWenlei He unsigned FuncFinalSize = FuncSize;
20630b02323SWenlei He unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
20730b02323SWenlei He SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
20830b02323SWenlei He SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
20930b02323SWenlei He
210eca03d27SWenlei He LLVM_DEBUG(dbgs() << "Process " << Name
211eca03d27SWenlei He << " for context-sensitive pre-inlining (pre-inline size: "
212eca03d27SWenlei He << FuncSize << ", size limit: " << SizeLimit << ")\n");
213eca03d27SWenlei He
21430b02323SWenlei He ProfiledCandidateQueue CQueue;
21530b02323SWenlei He getInlineCandidates(CQueue, FSamples);
21630b02323SWenlei He
21730b02323SWenlei He while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
21830b02323SWenlei He ProfiledInlineCandidate Candidate = CQueue.top();
21930b02323SWenlei He CQueue.pop();
22030b02323SWenlei He bool ShouldInline = false;
22130b02323SWenlei He if ((ShouldInline = shouldInline(Candidate))) {
22230b02323SWenlei He // We mark context as inlined as the corresponding context profile
22330b02323SWenlei He // won't be merged into that function's base profile.
224f10004e7SWenlei He ++PreInlNumCSInlined;
22530b02323SWenlei He ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
22630b02323SWenlei He Candidate.CalleeSamples->getContext().setAttribute(
22730b02323SWenlei He ContextShouldBeInlined);
22830b02323SWenlei He FuncFinalSize += Candidate.SizeCost;
22930b02323SWenlei He getInlineCandidates(CQueue, Candidate.CalleeSamples);
230f10004e7SWenlei He } else {
231f10004e7SWenlei He ++PreInlNumCSNotInlined;
23230b02323SWenlei He }
2337e86b13cSwlei LLVM_DEBUG(
2347e86b13cSwlei dbgs() << (ShouldInline ? " Inlined" : " Outlined")
23530b02323SWenlei He << " context profile for: "
2367e86b13cSwlei << ContextTracker.getContextString(*Candidate.CalleeSamples)
23730b02323SWenlei He << " (callee size: " << Candidate.SizeCost
23830b02323SWenlei He << ", call count:" << Candidate.CallsiteCount << ")\n");
23930b02323SWenlei He }
24030b02323SWenlei He
241f10004e7SWenlei He if (!CQueue.empty()) {
242f10004e7SWenlei He if (SizeLimit == (unsigned)ProfileInlineLimitMax)
243f10004e7SWenlei He ++PreInlNumCSInlinedHitMaxLimit;
244f10004e7SWenlei He else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
245f10004e7SWenlei He ++PreInlNumCSInlinedHitMinLimit;
246f10004e7SWenlei He else
247f10004e7SWenlei He ++PreInlNumCSInlinedHitGrowthLimit;
248f10004e7SWenlei He }
249f10004e7SWenlei He
25030b02323SWenlei He LLVM_DEBUG({
25130b02323SWenlei He if (!CQueue.empty())
25230b02323SWenlei He dbgs() << " Inline candidates ignored due to size limit (inliner "
25330b02323SWenlei He "original size: "
25430b02323SWenlei He << FuncSize << ", inliner final size: " << FuncFinalSize
25530b02323SWenlei He << ", size limit: " << SizeLimit << ")\n";
25630b02323SWenlei He
25730b02323SWenlei He while (!CQueue.empty()) {
25830b02323SWenlei He ProfiledInlineCandidate Candidate = CQueue.top();
25930b02323SWenlei He CQueue.pop();
26030b02323SWenlei He bool WasInlined =
26130b02323SWenlei He Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
2627e86b13cSwlei dbgs() << " "
2637e86b13cSwlei << ContextTracker.getContextString(*Candidate.CalleeSamples)
26430b02323SWenlei He << " (candidate size:" << Candidate.SizeCost
26530b02323SWenlei He << ", call count: " << Candidate.CallsiteCount << ", previously "
26630b02323SWenlei He << (WasInlined ? "inlined)\n" : "not inlined)\n");
26730b02323SWenlei He }
26830b02323SWenlei He });
26930b02323SWenlei He }
27030b02323SWenlei He
run()27130b02323SWenlei He void CSPreInliner::run() {
27230b02323SWenlei He #ifndef NDEBUG
2737e86b13cSwlei auto printProfileNames = [](SampleContextTracker &ContextTracker,
2747e86b13cSwlei bool IsInput) {
2757e86b13cSwlei uint32_t Size = 0;
2767e86b13cSwlei for (auto *Node : ContextTracker) {
2777e86b13cSwlei FunctionSamples *FSamples = Node->getFunctionSamples();
2787e86b13cSwlei if (FSamples) {
2797e86b13cSwlei Size++;
2807e86b13cSwlei dbgs() << " [" << ContextTracker.getContextString(Node) << "] "
2817e86b13cSwlei << FSamples->getTotalSamples() << ":"
2827e86b13cSwlei << FSamples->getHeadSamples() << "\n";
28330b02323SWenlei He }
2847e86b13cSwlei }
2857e86b13cSwlei dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
2867e86b13cSwlei << Size << " total):\n";
28730b02323SWenlei He };
28830b02323SWenlei He #endif
28930b02323SWenlei He
2907e86b13cSwlei LLVM_DEBUG(printProfileNames(ContextTracker, true));
29130b02323SWenlei He
29230b02323SWenlei He // Execute global pre-inliner to estimate a global top-down inline
29330b02323SWenlei He // decision and merge profiles accordingly. This helps with profile
29430b02323SWenlei He // merge for ThinLTO otherwise we won't be able to merge profiles back
29530b02323SWenlei He // to base profile across module/thin-backend boundaries.
29630b02323SWenlei He // It also helps better compress context profile to control profile
29730b02323SWenlei He // size, as we now only need context profile for functions going to
29830b02323SWenlei He // be inlined.
299ef0e0adcSWilliam Junda Huang for (FunctionId FuncName : buildTopDownOrder()) {
30030b02323SWenlei He processFunction(FuncName);
30130b02323SWenlei He }
30230b02323SWenlei He
30330b02323SWenlei He // Not inlined context profiles are merged into its base, so we can
30430b02323SWenlei He // trim out such profiles from the output.
3057e86b13cSwlei for (auto *Node : ContextTracker) {
3067e86b13cSwlei FunctionSamples *FProfile = Node->getFunctionSamples();
3077e86b13cSwlei if (FProfile &&
3087e86b13cSwlei (Node->getParentContext() != &ContextTracker.getRootContext() &&
3097e86b13cSwlei !FProfile->getContext().hasState(InlinedContext))) {
3107e86b13cSwlei Node->setFunctionSamples(nullptr);
31130b02323SWenlei He }
31230b02323SWenlei He }
313e36786d1SHongtao Yu FunctionSamples::ProfileIsPreInlined = true;
314e36786d1SHongtao Yu
3157e86b13cSwlei LLVM_DEBUG(printProfileNames(ContextTracker, false));
31630b02323SWenlei He }
317