1 //===-- CSPreInliner.cpp - Profile guided preinliner -------------- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "CSPreInliner.h" 10 #include "llvm/ADT/SCCIterator.h" 11 #include <cstdint> 12 #include <queue> 13 14 #define DEBUG_TYPE "cs-preinliner" 15 16 using namespace llvm; 17 using namespace sampleprof; 18 19 static cl::opt<bool> EnableCSPreInliner( 20 "csspgo-preinliner", cl::Hidden, cl::init(false), 21 cl::desc("Run a global pre-inliner to merge context profile based on " 22 "estimated global top-down inline decisions")); 23 24 // The switches specify inline thresholds used in SampleProfileLoader inlining. 25 // TODO: the actual threshold to be tuned here because the size here is based 26 // on machine code not LLVM IR. 27 extern cl::opt<int> SampleHotCallSiteThreshold; 28 extern cl::opt<int> SampleColdCallSiteThreshold; 29 extern cl::opt<int> ProfileInlineGrowthLimit; 30 extern cl::opt<int> ProfileInlineLimitMin; 31 extern cl::opt<int> ProfileInlineLimitMax; 32 33 static cl::opt<bool> SamplePreInlineReplay( 34 "csspgo-replay-preinline", cl::Hidden, cl::init(false), 35 cl::desc( 36 "Replay previous inlining and adjust context profile accordingly")); 37 38 CSPreInliner::CSPreInliner(StringMap<FunctionSamples> &Profiles, 39 uint64_t HotThreshold, uint64_t ColdThreshold) 40 : ContextTracker(Profiles), ProfileMap(Profiles), 41 HotCountThreshold(HotThreshold), ColdCountThreshold(ColdThreshold) {} 42 43 std::vector<StringRef> CSPreInliner::buildTopDownOrder() { 44 std::vector<StringRef> Order; 45 ProfiledCallGraph ProfiledCG(ContextTracker); 46 47 // Now that we have a profiled call graph, construct top-down order 48 // by building up SCC and reversing SCC order. 49 scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG); 50 while (!I.isAtEnd()) { 51 for (ProfiledCallGraphNode *Node : *I) { 52 if (Node != ProfiledCG.getEntryNode()) 53 Order.push_back(Node->Name); 54 } 55 ++I; 56 } 57 std::reverse(Order.begin(), Order.end()); 58 59 return Order; 60 } 61 62 bool CSPreInliner::getInlineCandidates(ProfiledCandidateQueue &CQueue, 63 const FunctionSamples *CallerSamples) { 64 assert(CallerSamples && "Expect non-null caller samples"); 65 66 // Ideally we want to consider everything a function calls, but as far as 67 // context profile is concerned, only those frames that are children of 68 // current one in the trie is relavent. So we walk the trie instead of call 69 // targets from function profile. 70 ContextTrieNode *CallerNode = 71 ContextTracker.getContextFor(CallerSamples->getContext()); 72 73 bool HasNewCandidate = false; 74 for (auto &Child : CallerNode->getAllChildContext()) { 75 ContextTrieNode *CalleeNode = &Child.second; 76 FunctionSamples *CalleeSamples = CalleeNode->getFunctionSamples(); 77 if (!CalleeSamples) 78 continue; 79 80 // Call site count is more reliable, so we look up the corresponding call 81 // target profile in caller's context profile to retrieve call site count. 82 uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); 83 uint64_t CallsiteCount = 0; 84 LineLocation Callsite = CalleeNode->getCallSiteLoc(); 85 if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { 86 SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); 87 auto It = TargetCounts.find(CalleeSamples->getName()); 88 if (It != TargetCounts.end()) 89 CallsiteCount = It->second; 90 } 91 92 // TODO: call site and callee entry count should be mostly consistent, add 93 // check for that. 94 HasNewCandidate = true; 95 CQueue.emplace(CalleeSamples, std::max(CallsiteCount, CalleeEntryCount)); 96 } 97 98 return HasNewCandidate; 99 } 100 101 bool CSPreInliner::shouldInline(ProfiledInlineCandidate &Candidate) { 102 // If replay inline is requested, simply follow the inline decision of the 103 // profiled binary. 104 if (SamplePreInlineReplay) 105 return Candidate.CalleeSamples->getContext().hasAttribute( 106 ContextWasInlined); 107 108 // Adjust threshold based on call site hotness, only do this for callsite 109 // prioritized inliner because otherwise cost-benefit check is done earlier. 110 unsigned int SampleThreshold = SampleColdCallSiteThreshold; 111 if (Candidate.CallsiteCount > HotCountThreshold) 112 SampleThreshold = SampleHotCallSiteThreshold; 113 114 // TODO: for small cold functions, we may inlined them and we need to keep 115 // context profile accordingly. 116 if (Candidate.CallsiteCount < ColdCountThreshold) 117 SampleThreshold = SampleColdCallSiteThreshold; 118 119 return (Candidate.SizeCost < SampleThreshold); 120 } 121 122 void CSPreInliner::processFunction(const StringRef Name) { 123 LLVM_DEBUG(dbgs() << "Process " << Name 124 << " for context-sensitive pre-inlining\n"); 125 126 FunctionSamples *FSamples = ContextTracker.getBaseSamplesFor(Name); 127 if (!FSamples) 128 return; 129 130 // Use the number of lines/probes as proxy for function size for now. 131 // TODO: retrieve accurate size from dwarf or binary instead. 132 unsigned FuncSize = FSamples->getBodySamples().size(); 133 unsigned FuncFinalSize = FuncSize; 134 unsigned SizeLimit = FuncSize * ProfileInlineGrowthLimit; 135 SizeLimit = std::min(SizeLimit, (unsigned)ProfileInlineLimitMax); 136 SizeLimit = std::max(SizeLimit, (unsigned)ProfileInlineLimitMin); 137 138 ProfiledCandidateQueue CQueue; 139 getInlineCandidates(CQueue, FSamples); 140 141 while (!CQueue.empty() && FuncFinalSize < SizeLimit) { 142 ProfiledInlineCandidate Candidate = CQueue.top(); 143 CQueue.pop(); 144 bool ShouldInline = false; 145 if ((ShouldInline = shouldInline(Candidate))) { 146 // We mark context as inlined as the corresponding context profile 147 // won't be merged into that function's base profile. 148 ContextTracker.markContextSamplesInlined(Candidate.CalleeSamples); 149 Candidate.CalleeSamples->getContext().setAttribute( 150 ContextShouldBeInlined); 151 FuncFinalSize += Candidate.SizeCost; 152 getInlineCandidates(CQueue, Candidate.CalleeSamples); 153 } 154 LLVM_DEBUG(dbgs() << (ShouldInline ? " Inlined" : " Outlined") 155 << " context profile for: " 156 << Candidate.CalleeSamples->getNameWithContext() 157 << " (callee size: " << Candidate.SizeCost 158 << ", call count:" << Candidate.CallsiteCount << ")\n"); 159 } 160 161 LLVM_DEBUG({ 162 if (!CQueue.empty()) 163 dbgs() << " Inline candidates ignored due to size limit (inliner " 164 "original size: " 165 << FuncSize << ", inliner final size: " << FuncFinalSize 166 << ", size limit: " << SizeLimit << ")\n"; 167 168 while (!CQueue.empty()) { 169 ProfiledInlineCandidate Candidate = CQueue.top(); 170 CQueue.pop(); 171 bool WasInlined = 172 Candidate.CalleeSamples->getContext().hasAttribute(ContextWasInlined); 173 dbgs() << " " << Candidate.CalleeSamples->getNameWithContext() 174 << " (candidate size:" << Candidate.SizeCost 175 << ", call count: " << Candidate.CallsiteCount << ", previously " 176 << (WasInlined ? "inlined)\n" : "not inlined)\n"); 177 } 178 }); 179 } 180 181 void CSPreInliner::run() { 182 if (!EnableCSPreInliner) 183 return; 184 185 #ifndef NDEBUG 186 auto printProfileNames = [](StringMap<FunctionSamples> &Profiles, 187 bool IsInput) { 188 dbgs() << (IsInput ? "Input" : "Output") << " context-sensitive profiles (" 189 << Profiles.size() << " total):\n"; 190 for (auto &It : Profiles) { 191 const FunctionSamples &Samples = It.second; 192 dbgs() << " [" << Samples.getNameWithContext() << "] " 193 << Samples.getTotalSamples() << ":" << Samples.getHeadSamples() 194 << "\n"; 195 } 196 }; 197 #endif 198 199 LLVM_DEBUG(printProfileNames(ProfileMap, true)); 200 201 // Execute global pre-inliner to estimate a global top-down inline 202 // decision and merge profiles accordingly. This helps with profile 203 // merge for ThinLTO otherwise we won't be able to merge profiles back 204 // to base profile across module/thin-backend boundaries. 205 // It also helps better compress context profile to control profile 206 // size, as we now only need context profile for functions going to 207 // be inlined. 208 for (StringRef FuncName : buildTopDownOrder()) { 209 processFunction(FuncName); 210 } 211 212 // Not inlined context profiles are merged into its base, so we can 213 // trim out such profiles from the output. 214 std::vector<StringRef> ProfilesToBeRemoved; 215 for (auto &It : ProfileMap) { 216 SampleContext Context = It.second.getContext(); 217 if (!Context.isBaseContext() && !Context.hasState(InlinedContext)) { 218 assert(Context.hasState(MergedContext) && 219 "Not inlined context profile should be merged already"); 220 ProfilesToBeRemoved.push_back(It.first()); 221 } 222 } 223 224 for (StringRef ContextName : ProfilesToBeRemoved) { 225 ProfileMap.erase(ContextName); 226 } 227 228 // Make sure ProfileMap's key is consistent with FunctionSamples' name. 229 SampleContextTrimmer(ProfileMap).canonicalizeContextProfiles(); 230 231 LLVM_DEBUG(printProfileNames(ProfileMap, false)); 232 } 233