xref: /llvm-project/llvm/tools/llvm-profgen/CSPreInliner.cpp (revision 9423e459875b0dcdf24975976838d651a92f1bdb)
130b02323SWenlei He //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===//
230b02323SWenlei He //
330b02323SWenlei He // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
430b02323SWenlei He // See https://llvm.org/LICENSE.txt for license information.
530b02323SWenlei He // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
630b02323SWenlei He //
730b02323SWenlei He //===----------------------------------------------------------------------===//
830b02323SWenlei He 
930b02323SWenlei He #include "CSPreInliner.h"
10eca03d27SWenlei He #include "ProfiledBinary.h"
1130b02323SWenlei He #include "llvm/ADT/SCCIterator.h"
12f10004e7SWenlei He #include "llvm/ADT/Statistic.h"
13db29f437Sserge-sans-paille #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14e7247f10STom Stellard #include "llvm/Transforms/IPO/SampleProfile.h"
1530b02323SWenlei He #include <cstdint>
1630b02323SWenlei He #include <queue>
1730b02323SWenlei He 
1830b02323SWenlei He #define DEBUG_TYPE "cs-preinliner"
1930b02323SWenlei He 
2030b02323SWenlei He using namespace llvm;
2130b02323SWenlei He using namespace sampleprof;
2230b02323SWenlei He 
23f10004e7SWenlei He STATISTIC(PreInlNumCSInlined,
24f10004e7SWenlei He           "Number of functions inlined with context sensitive profile");
25f10004e7SWenlei He STATISTIC(PreInlNumCSNotInlined,
26f10004e7SWenlei He           "Number of functions not inlined with context sensitive profile");
27f10004e7SWenlei He STATISTIC(PreInlNumCSInlinedHitMinLimit,
28f10004e7SWenlei He           "Number of functions with FDO inline stopped due to min size limit");
29f10004e7SWenlei He STATISTIC(PreInlNumCSInlinedHitMaxLimit,
30f10004e7SWenlei He           "Number of functions with FDO inline stopped due to max size limit");
31f10004e7SWenlei He STATISTIC(
32f10004e7SWenlei He     PreInlNumCSInlinedHitGrowthLimit,
33f10004e7SWenlei He     "Number of functions with FDO inline stopped due to growth size limit");
34f10004e7SWenlei He 
3530b02323SWenlei He // The switches specify inline thresholds used in SampleProfileLoader inlining.
3630b02323SWenlei He // TODO: the actual threshold to be tuned here because the size here is based
3730b02323SWenlei He // on machine code not LLVM IR.
381e692113SFangrui Song namespace llvm {
39eca03d27SWenlei He cl::opt<bool> EnableCSPreInliner(
40f6f0409fSWenlei He     "csspgo-preinliner", cl::Hidden, cl::init(true),
41eca03d27SWenlei He     cl::desc("Run a global pre-inliner to merge context profile based on "
42eca03d27SWenlei He              "estimated global top-down inline decisions"));
43eca03d27SWenlei He 
44eca03d27SWenlei He cl::opt<bool> UseContextCostForPreInliner(
45446e2162SWenlei He     "use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
46eca03d27SWenlei He     cl::desc("Use context-sensitive byte size cost for preinliner decisions"));
471e692113SFangrui Song } // namespace llvm
48eca03d27SWenlei He 
4930b02323SWenlei He static cl::opt<bool> SamplePreInlineReplay(
5030b02323SWenlei He     "csspgo-replay-preinline", cl::Hidden, cl::init(false),
5130b02323SWenlei He     cl::desc(
5230b02323SWenlei He         "Replay previous inlining and adjust context profile accordingly"));
5330b02323SWenlei He 
545cbcaf16SHongtao Yu static cl::opt<int> CSPreinlMultiplierForPrevInl(
555cbcaf16SHongtao Yu     "csspgo-preinliner-multiplier-for-previous-inlining", cl::Hidden,
565cbcaf16SHongtao Yu     cl::init(100),
575cbcaf16SHongtao Yu     cl::desc(
585cbcaf16SHongtao Yu         "Multiplier to bump up callsite threshold for previous inlining."));
595cbcaf16SHongtao Yu 
CSPreInliner(SampleContextTracker & Tracker,ProfiledBinary & Binary,ProfileSummary * Summary)607e86b13cSwlei CSPreInliner::CSPreInliner(SampleContextTracker &Tracker,
617e86b13cSwlei                            ProfiledBinary &Binary, ProfileSummary *Summary)
627ca80300SHongtao Yu     : UseContextCost(UseContextCostForPreInliner),
637ca80300SHongtao Yu       // TODO: Pass in a guid-to-name map in order for
647ca80300SHongtao Yu       // ContextTracker.getFuncNameFor to work, if `Profiles` can have md5 codes
657ca80300SHongtao Yu       // as their profile context.
667e86b13cSwlei       ContextTracker(Tracker), Binary(Binary), Summary(Summary) {
67f6f0409fSWenlei He   // Set default preinliner hot/cold call site threshold tuned with CSSPGO.
68f6f0409fSWenlei He   // for good performance with reasonable profile size.
69f6f0409fSWenlei He   if (!SampleHotCallSiteThreshold.getNumOccurrences())
70f6f0409fSWenlei He     SampleHotCallSiteThreshold = 1500;
71f6f0409fSWenlei He   if (!SampleColdCallSiteThreshold.getNumOccurrences())
72f6f0409fSWenlei He     SampleColdCallSiteThreshold = 0;
737d293744SHongtao Yu   if (!ProfileInlineLimitMax.getNumOccurrences())
7439eb1c61SHongtao Yu     ProfileInlineLimitMax = 50000;
75f6f0409fSWenlei He }
7630b02323SWenlei He 
buildTopDownOrder()77ef0e0adcSWilliam Junda Huang std::vector<FunctionId> CSPreInliner::buildTopDownOrder() {
78ef0e0adcSWilliam Junda Huang   std::vector<FunctionId> Order;
795c2ae37bSHongtao Yu   // Trim cold edges to get a more stable call graph. This allows for a more
805c2ae37bSHongtao Yu   // stable top-down order which in turns helps the stablity of the generated
815c2ae37bSHongtao Yu   // profile from run to run.
825c2ae37bSHongtao Yu   uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
835c2ae37bSHongtao Yu       (Summary->getDetailedSummary()));
845c2ae37bSHongtao Yu   ProfiledCallGraph ProfiledCG(ContextTracker, ColdCountThreshold);
8530b02323SWenlei He 
8630b02323SWenlei He   // Now that we have a profiled call graph, construct top-down order
8730b02323SWenlei He   // by building up SCC and reversing SCC order.
8830b02323SWenlei He   scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
8930b02323SWenlei He   while (!I.isAtEnd()) {
90bf317f66SHongtao Yu     auto Range = *I;
91bf317f66SHongtao Yu     if (SortProfiledSCC) {
92bf317f66SHongtao Yu       // Sort nodes in one SCC based on callsite hotness.
93bf317f66SHongtao Yu       scc_member_iterator<ProfiledCallGraph *> SI(*I);
94bf317f66SHongtao Yu       Range = *SI;
95bf317f66SHongtao Yu     }
96bf317f66SHongtao Yu     for (auto *Node : Range) {
9730b02323SWenlei He       if (Node != ProfiledCG.getEntryNode())
9830b02323SWenlei He         Order.push_back(Node->Name);
9930b02323SWenlei He     }
10030b02323SWenlei He     ++I;
10130b02323SWenlei He   }
10230b02323SWenlei He   std::reverse(Order.begin(), Order.end());
10330b02323SWenlei He 
10430b02323SWenlei He   return Order;
10530b02323SWenlei He }
10630b02323SWenlei He 
getInlineCandidates(ProfiledCandidateQueue & CQueue,const FunctionSamples * CallerSamples)10730b02323SWenlei He bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue,
10830b02323SWenlei He                                        const FunctionSamples *CallerSamples) {
10930b02323SWenlei He   assert(CallerSamples && "Expect non-null caller samples");
11030b02323SWenlei He 
11130b02323SWenlei He   // Ideally we want to consider everything a function calls, but as far as
11230b02323SWenlei He   // context profile is concerned, only those frames that are children of
11330b02323SWenlei He   // current one in the trie is relavent. So we walk the trie instead of call
11430b02323SWenlei He   // targets from function profile.
11530b02323SWenlei He   ContextTrieNode *CallerNode =
1167e86b13cSwlei       ContextTracker.getContextNodeForProfile(CallerSamples);
11730b02323SWenlei He 
11830b02323SWenlei He   bool HasNewCandidate = false;
11930b02323SWenlei He   for (auto &Child : CallerNode->getAllChildContext()) {
12030b02323SWenlei He     ContextTrieNode *CalleeNode = &Child.second;
12130b02323SWenlei He     FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples();
12230b02323SWenlei He     if (!CalleeSamples)
12330b02323SWenlei He       continue;
12430b02323SWenlei He 
12530b02323SWenlei He     // Call site count is more reliable, so we look up the corresponding call
12630b02323SWenlei He     // target profile in caller's context profile to retrieve call site count.
1277b81a81dSMircea Trofin     uint64_t CalleeEntryCount = CalleeSamples->getHeadSamplesEstimate();
12830b02323SWenlei He     uint64_t CallsiteCount = 0;
12930b02323SWenlei He     LineLocation Callsite = CalleeNode->getCallSiteLoc();
13030b02323SWenlei He     if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
131*9423e459SBenjamin Kramer       auto It = CallTargets->find(CalleeSamples->getFunction());
132*9423e459SBenjamin Kramer       if (It != CallTargets->end())
13330b02323SWenlei He         CallsiteCount = It->second;
13430b02323SWenlei He     }
13530b02323SWenlei He 
13630b02323SWenlei He     // TODO: call site and callee entry count should be mostly consistent, add
13730b02323SWenlei He     // check for that.
13830b02323SWenlei He     HasNewCandidate = true;
1397e86b13cSwlei     uint32_t CalleeSize = getFuncSize(CalleeNode);
140eca03d27SWenlei He     CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount),
141eca03d27SWenlei He                    CalleeSize);
14230b02323SWenlei He   }
14330b02323SWenlei He 
14430b02323SWenlei He   return HasNewCandidate;
14530b02323SWenlei He }
14630b02323SWenlei He 
getFuncSize(const ContextTrieNode * ContextNode)1477e86b13cSwlei uint32_t CSPreInliner::getFuncSize(const ContextTrieNode *ContextNode) {
1487e86b13cSwlei   if (UseContextCost)
1497e86b13cSwlei     return Binary.getFuncSizeForContext(ContextNode);
150eca03d27SWenlei He 
1517e86b13cSwlei   return ContextNode->getFunctionSamples()->getBodySamples().size();
152eca03d27SWenlei He }
153eca03d27SWenlei He 
shouldInline(ProfiledInlineCandidate & Candidate)15430b02323SWenlei He bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) {
1555cbcaf16SHongtao Yu   bool WasInlined =
1565cbcaf16SHongtao Yu       Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
15730b02323SWenlei He   // If replay inline is requested, simply follow the inline decision of the
15830b02323SWenlei He   // profiled binary.
15930b02323SWenlei He   if (SamplePreInlineReplay)
1605cbcaf16SHongtao Yu     return WasInlined;
16130b02323SWenlei He 
16230b02323SWenlei He   unsigned int SampleThreshold = SampleColdCallSiteThreshold;
163a4190037SHongtao Yu   uint64_t ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
164a4190037SHongtao Yu       (Summary->getDetailedSummary()));
16530b02323SWenlei He 
166a4190037SHongtao Yu   if (Candidate.CallsiteCount <= ColdCountThreshold)
16730b02323SWenlei He     SampleThreshold = SampleColdCallSiteThreshold;
168a4190037SHongtao Yu   else {
169a4190037SHongtao Yu     // Linearly adjust threshold based on normalized hotness, i.e, a value in
170a4190037SHongtao Yu     // [0,1]. Use 10% cutoff instead of the max count as the normalization
171a4190037SHongtao Yu     // upperbound for stability.
172a4190037SHongtao Yu     double NormalizationUpperBound =
173a4190037SHongtao Yu         ProfileSummaryBuilder::getEntryForPercentile(
174a4190037SHongtao Yu             Summary->getDetailedSummary(), 100000 /* 10% */)
175a4190037SHongtao Yu             .MinCount;
176a4190037SHongtao Yu     double NormalizationLowerBound = ColdCountThreshold;
177a4190037SHongtao Yu     double NormalizedHotness =
178a4190037SHongtao Yu         (Candidate.CallsiteCount - NormalizationLowerBound) /
179a4190037SHongtao Yu         (NormalizationUpperBound - NormalizationLowerBound);
180a4190037SHongtao Yu     if (NormalizedHotness > 1.0)
181a4190037SHongtao Yu       NormalizedHotness = 1.0;
182111fcb0dSFangrui Song     // Add 1 to ensure hot callsites get a non-zero threshold, which could
183a4190037SHongtao Yu     // happen when SampleColdCallSiteThreshold is 0. This is when we do not
184a4190037SHongtao Yu     // want any inlining for cold callsites.
185a4190037SHongtao Yu     SampleThreshold = SampleHotCallSiteThreshold * NormalizedHotness * 100 +
186a4190037SHongtao Yu                       SampleColdCallSiteThreshold + 1;
1875cbcaf16SHongtao Yu     // Bump up the threshold to favor previous compiler inline decision. The
1885cbcaf16SHongtao Yu     // compiler has more insight and knowledge about functions based on their IR
1895cbcaf16SHongtao Yu     // and attribures and should be able to make a more reasonable inline
1905cbcaf16SHongtao Yu     // decision.
1915cbcaf16SHongtao Yu     if (WasInlined)
1925cbcaf16SHongtao Yu       SampleThreshold *= CSPreinlMultiplierForPrevInl;
193a4190037SHongtao Yu   }
19430b02323SWenlei He 
19530b02323SWenlei He   return (Candidate.SizeCost < SampleThreshold);
19630b02323SWenlei He }
19730b02323SWenlei He 
processFunction(const FunctionId Name)198ef0e0adcSWilliam Junda Huang void CSPreInliner::processFunction(const FunctionId Name) {
19930b02323SWenlei He   FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name);
20030b02323SWenlei He   if (!FSamples)
20130b02323SWenlei He     return;
20230b02323SWenlei He 
2037e86b13cSwlei   unsigned FuncSize =
2047e86b13cSwlei       getFuncSize(ContextTracker.getContextNodeForProfile(FSamples));
20530b02323SWenlei He   unsigned FuncFinalSize = FuncSize;
20630b02323SWenlei He   unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit;
20730b02323SWenlei He   SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax);
20830b02323SWenlei He   SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin);
20930b02323SWenlei He 
210eca03d27SWenlei He   LLVM_DEBUG(dbgs() << "Process " << Name
211eca03d27SWenlei He                     << " for context-sensitive pre-inlining (pre-inline size: "
212eca03d27SWenlei He                     << FuncSize << ", size limit: " << SizeLimit << ")\n");
213eca03d27SWenlei He 
21430b02323SWenlei He   ProfiledCandidateQueue CQueue;
21530b02323SWenlei He   getInlineCandidates(CQueue, FSamples);
21630b02323SWenlei He 
21730b02323SWenlei He   while (!CQueue.empty() && FuncFinalSize < SizeLimit) {
21830b02323SWenlei He     ProfiledInlineCandidate Candidate = CQueue.top();
21930b02323SWenlei He     CQueue.pop();
22030b02323SWenlei He     bool ShouldInline = false;
22130b02323SWenlei He     if ((ShouldInline = shouldInline(Candidate))) {
22230b02323SWenlei He       // We mark context as inlined as the corresponding context profile
22330b02323SWenlei He       // won't be merged into that function's base profile.
224f10004e7SWenlei He       ++PreInlNumCSInlined;
22530b02323SWenlei He       ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples);
22630b02323SWenlei He       Candidate.CalleeSamples->getContext().setAttribute(
22730b02323SWenlei He           ContextShouldBeInlined);
22830b02323SWenlei He       FuncFinalSize += Candidate.SizeCost;
22930b02323SWenlei He       getInlineCandidates(CQueue, Candidate.CalleeSamples);
230f10004e7SWenlei He     } else {
231f10004e7SWenlei He       ++PreInlNumCSNotInlined;
23230b02323SWenlei He     }
2337e86b13cSwlei     LLVM_DEBUG(
2347e86b13cSwlei         dbgs() << (ShouldInline ? "  Inlined" : "  Outlined")
23530b02323SWenlei He                << " context profile for: "
2367e86b13cSwlei                << ContextTracker.getContextString(*Candidate.CalleeSamples)
23730b02323SWenlei He                << " (callee size: " << Candidate.SizeCost
23830b02323SWenlei He                << ", call count:" << Candidate.CallsiteCount << ")\n");
23930b02323SWenlei He   }
24030b02323SWenlei He 
241f10004e7SWenlei He   if (!CQueue.empty()) {
242f10004e7SWenlei He     if (SizeLimit == (unsigned)ProfileInlineLimitMax)
243f10004e7SWenlei He       ++PreInlNumCSInlinedHitMaxLimit;
244f10004e7SWenlei He     else if (SizeLimit == (unsigned)ProfileInlineLimitMin)
245f10004e7SWenlei He       ++PreInlNumCSInlinedHitMinLimit;
246f10004e7SWenlei He     else
247f10004e7SWenlei He       ++PreInlNumCSInlinedHitGrowthLimit;
248f10004e7SWenlei He   }
249f10004e7SWenlei He 
25030b02323SWenlei He   LLVM_DEBUG({
25130b02323SWenlei He     if (!CQueue.empty())
25230b02323SWenlei He       dbgs() << "  Inline candidates ignored due to size limit (inliner "
25330b02323SWenlei He                 "original size: "
25430b02323SWenlei He              << FuncSize << ", inliner final size: " << FuncFinalSize
25530b02323SWenlei He              << ", size limit: " << SizeLimit << ")\n";
25630b02323SWenlei He 
25730b02323SWenlei He     while (!CQueue.empty()) {
25830b02323SWenlei He       ProfiledInlineCandidate Candidate = CQueue.top();
25930b02323SWenlei He       CQueue.pop();
26030b02323SWenlei He       bool WasInlined =
26130b02323SWenlei He           Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined);
2627e86b13cSwlei       dbgs() << "    "
2637e86b13cSwlei              << ContextTracker.getContextString(*Candidate.CalleeSamples)
26430b02323SWenlei He              << " (candidate size:" << Candidate.SizeCost
26530b02323SWenlei He              << ", call count: " << Candidate.CallsiteCount << ", previously "
26630b02323SWenlei He              << (WasInlined ? "inlined)\n" : "not inlined)\n");
26730b02323SWenlei He     }
26830b02323SWenlei He   });
26930b02323SWenlei He }
27030b02323SWenlei He 
run()27130b02323SWenlei He void CSPreInliner::run() {
27230b02323SWenlei He #ifndef NDEBUG
2737e86b13cSwlei   auto printProfileNames = [](SampleContextTracker &ContextTracker,
2747e86b13cSwlei                               bool IsInput) {
2757e86b13cSwlei     uint32_t Size = 0;
2767e86b13cSwlei     for (auto *Node : ContextTracker) {
2777e86b13cSwlei       FunctionSamples *FSamples = Node->getFunctionSamples();
2787e86b13cSwlei       if (FSamples) {
2797e86b13cSwlei         Size++;
2807e86b13cSwlei         dbgs() << "  [" << ContextTracker.getContextString(Node) << "] "
2817e86b13cSwlei                << FSamples->getTotalSamples() << ":"
2827e86b13cSwlei                << FSamples->getHeadSamples() << "\n";
28330b02323SWenlei He       }
2847e86b13cSwlei     }
2857e86b13cSwlei     dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles ("
2867e86b13cSwlei            << Size << " total):\n";
28730b02323SWenlei He   };
28830b02323SWenlei He #endif
28930b02323SWenlei He 
2907e86b13cSwlei   LLVM_DEBUG(printProfileNames(ContextTracker, true));
29130b02323SWenlei He 
29230b02323SWenlei He   // Execute global pre-inliner to estimate a global top-down inline
29330b02323SWenlei He   // decision and merge profiles accordingly. This helps with profile
29430b02323SWenlei He   // merge for ThinLTO otherwise we won't be able to merge profiles back
29530b02323SWenlei He   // to base profile across module/thin-backend boundaries.
29630b02323SWenlei He   // It also helps better compress context profile to control profile
29730b02323SWenlei He   // size, as we now only need context profile for functions going to
29830b02323SWenlei He   // be inlined.
299ef0e0adcSWilliam Junda Huang   for (FunctionId FuncName : buildTopDownOrder()) {
30030b02323SWenlei He     processFunction(FuncName);
30130b02323SWenlei He   }
30230b02323SWenlei He 
30330b02323SWenlei He   // Not inlined context profiles are merged into its base, so we can
30430b02323SWenlei He   // trim out such profiles from the output.
3057e86b13cSwlei   for (auto *Node : ContextTracker) {
3067e86b13cSwlei     FunctionSamples *FProfile = Node->getFunctionSamples();
3077e86b13cSwlei     if (FProfile &&
3087e86b13cSwlei         (Node->getParentContext() != &ContextTracker.getRootContext() &&
3097e86b13cSwlei          !FProfile->getContext().hasState(InlinedContext))) {
3107e86b13cSwlei       Node->setFunctionSamples(nullptr);
31130b02323SWenlei He     }
31230b02323SWenlei He   }
313e36786d1SHongtao Yu   FunctionSamples::ProfileIsPreInlined = true;
314e36786d1SHongtao Yu 
3157e86b13cSwlei   LLVM_DEBUG(printProfileNames(ContextTracker, false));
31630b02323SWenlei He }
317