xref: /llvm-project/llvm/tools/llvm-profgen/ProfileGenerator.cpp (revision ee09f7d1fc173f2b495838c925f2cf39a2b55369)
1 //===-- ProfileGenerator.cpp - Profile Generator  ---------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 #include "ProfileGenerator.h"
9 #include "ErrorHandling.h"
10 #include "MissingFrameInferrer.h"
11 #include "PerfReader.h"
12 #include "ProfiledBinary.h"
13 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
14 #include "llvm/ProfileData/ProfileCommon.h"
15 #include <algorithm>
16 #include <float.h>
17 #include <unordered_set>
18 #include <utility>
19 
20 cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
21                                     cl::Required,
22                                     cl::desc("Output profile file"));
23 static cl::alias OutputA("o", cl::desc("Alias for --output"),
24                          cl::aliasopt(OutputFilename));
25 
26 static cl::opt<SampleProfileFormat> OutputFormat(
27     "format", cl::desc("Format of output profile"), cl::init(SPF_Ext_Binary),
28     cl::values(
29         clEnumValN(SPF_Binary, "binary", "Binary encoding (default)"),
30         clEnumValN(SPF_Ext_Binary, "extbinary", "Extensible binary encoding"),
31         clEnumValN(SPF_Text, "text", "Text encoding"),
32         clEnumValN(SPF_GCC, "gcc",
33                    "GCC encoding (only meaningful for -sample)")));
34 
35 static cl::opt<bool> UseMD5(
36     "use-md5", cl::Hidden,
37     cl::desc("Use md5 to represent function names in the output profile (only "
38              "meaningful for -extbinary)"));
39 
40 static cl::opt<bool> PopulateProfileSymbolList(
41     "populate-profile-symbol-list", cl::init(false), cl::Hidden,
42     cl::desc("Populate profile symbol list (only meaningful for -extbinary)"));
43 
44 static cl::opt<bool> FillZeroForAllFuncs(
45     "fill-zero-for-all-funcs", cl::init(false), cl::Hidden,
46     cl::desc("Attribute all functions' range with zero count "
47              "even it's not hit by any samples."));
48 
49 static cl::opt<int32_t, true> RecursionCompression(
50     "compress-recursion",
51     cl::desc("Compressing recursion by deduplicating adjacent frame "
52              "sequences up to the specified size. -1 means no size limit."),
53     cl::Hidden,
54     cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
55 
56 static cl::opt<bool>
57     TrimColdProfile("trim-cold-profile",
58                     cl::desc("If the total count of the profile is smaller "
59                              "than threshold, it will be trimmed."));
60 
61 static cl::opt<bool> CSProfMergeColdContext(
62     "csprof-merge-cold-context", cl::init(true),
63     cl::desc("If the total count of context profile is smaller than "
64              "the threshold, it will be merged into context-less base "
65              "profile."));
66 
67 static cl::opt<uint32_t> CSProfMaxColdContextDepth(
68     "csprof-max-cold-context-depth", cl::init(1),
69     cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
70              "context-less base profile"));
71 
72 static cl::opt<int, true> CSProfMaxContextDepth(
73     "csprof-max-context-depth",
74     cl::desc("Keep the last K contexts while merging profile. -1 means no "
75              "depth limit."),
76     cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
77 
78 static cl::opt<double> ProfileDensityThreshold(
79     "profile-density-threshold", llvm::cl::init(50),
80     llvm::cl::desc("If the profile density is below the given threshold, it "
81                    "will be suggested to increase the sampling rate."),
82     llvm::cl::Optional);
83 static cl::opt<bool> ShowDensity("show-density", llvm::cl::init(false),
84                                  llvm::cl::desc("show profile density details"),
85                                  llvm::cl::Optional);
86 static cl::opt<int> ProfileDensityCutOffHot(
87     "profile-density-cutoff-hot", llvm::cl::init(990000),
88     llvm::cl::desc("Total samples cutoff for functions used to calculate "
89                    "profile density."));
90 
91 static cl::opt<bool> UpdateTotalSamples(
92     "update-total-samples", llvm::cl::init(false),
93     llvm::cl::desc(
94         "Update total samples by accumulating all its body samples."),
95     llvm::cl::Optional);
96 
97 static cl::opt<bool> GenCSNestedProfile(
98     "gen-cs-nested-profile", cl::Hidden, cl::init(true),
99     cl::desc("Generate nested function profiles for CSSPGO"));
100 
101 cl::opt<bool> InferMissingFrames(
102     "infer-missing-frames", llvm::cl::init(true),
103     llvm::cl::desc(
104         "Infer missing call frames due to compiler tail call elimination."),
105     llvm::cl::Optional);
106 
107 using namespace llvm;
108 using namespace sampleprof;
109 
110 namespace llvm {
111 extern cl::opt<int> ProfileSummaryCutoffHot;
112 extern cl::opt<bool> UseContextLessSummary;
113 
114 namespace sampleprof {
115 
116 // Initialize the MaxCompressionSize to -1 which means no size limit
117 int32_t CSProfileGenerator::MaxCompressionSize = -1;
118 
119 int CSProfileGenerator::MaxContextDepth = -1;
120 
121 bool ProfileGeneratorBase::UseFSDiscriminator = false;
122 
123 std::unique_ptr<ProfileGeneratorBase>
124 ProfileGeneratorBase::create(ProfiledBinary *Binary,
125                              const ContextSampleCounterMap *SampleCounters,
126                              bool ProfileIsCS) {
127   std::unique_ptr<ProfileGeneratorBase> Generator;
128   if (ProfileIsCS) {
129     Generator.reset(new CSProfileGenerator(Binary, SampleCounters));
130   } else {
131     Generator.reset(new ProfileGenerator(Binary, SampleCounters));
132   }
133   ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
134   FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
135 
136   return Generator;
137 }
138 
139 std::unique_ptr<ProfileGeneratorBase>
140 ProfileGeneratorBase::create(ProfiledBinary *Binary, SampleProfileMap &Profiles,
141                              bool ProfileIsCS) {
142   std::unique_ptr<ProfileGeneratorBase> Generator;
143   if (ProfileIsCS) {
144     Generator.reset(new CSProfileGenerator(Binary, Profiles));
145   } else {
146     Generator.reset(new ProfileGenerator(Binary, std::move(Profiles)));
147   }
148   ProfileGeneratorBase::UseFSDiscriminator = Binary->useFSDiscriminator();
149   FunctionSamples::ProfileIsFS = Binary->useFSDiscriminator();
150 
151   return Generator;
152 }
153 
154 void ProfileGeneratorBase::write(std::unique_ptr<SampleProfileWriter> Writer,
155                                  SampleProfileMap &ProfileMap) {
156   // Populate profile symbol list if extended binary format is used.
157   ProfileSymbolList SymbolList;
158 
159   if (PopulateProfileSymbolList && OutputFormat == SPF_Ext_Binary) {
160     Binary->populateSymbolListFromDWARF(SymbolList);
161     Writer->setProfileSymbolList(&SymbolList);
162   }
163 
164   if (std::error_code EC = Writer->write(ProfileMap))
165     exitWithError(std::move(EC));
166 }
167 
168 void ProfileGeneratorBase::write() {
169   auto WriterOrErr = SampleProfileWriter::create(OutputFilename, OutputFormat);
170   if (std::error_code EC = WriterOrErr.getError())
171     exitWithError(EC, OutputFilename);
172 
173   if (UseMD5) {
174     if (OutputFormat != SPF_Ext_Binary)
175       WithColor::warning() << "-use-md5 is ignored. Specify "
176                               "--format=extbinary to enable it\n";
177     else
178       WriterOrErr.get()->setUseMD5();
179   }
180 
181   write(std::move(WriterOrErr.get()), ProfileMap);
182 }
183 
184 void ProfileGeneratorBase::showDensitySuggestion(double Density) {
185   if (Density == 0.0)
186     WithColor::warning() << "The output profile is empty or the "
187                             "--profile-density-cutoff-hot option is "
188                             "set too low. Please check your command.\n";
189   else if (Density < ProfileDensityThreshold)
190     WithColor::warning()
191         << "Sample PGO is estimated to optimize better with "
192         << format("%.1f", ProfileDensityThreshold / Density)
193         << "x more samples. Please consider increasing sampling rate or "
194            "profiling for longer duration to get more samples.\n";
195 
196   if (ShowDensity)
197     outs() << "Functions with density >= " << format("%.1f", Density)
198            << " account for "
199            << format("%.2f",
200                      static_cast<double>(ProfileDensityCutOffHot) / 10000)
201            << "% total sample counts.\n";
202 }
203 
204 bool ProfileGeneratorBase::filterAmbiguousProfile(FunctionSamples &FS) {
205   for (const auto &Prefix : FuncPrefixsToFilter) {
206     if (FS.getFuncName().starts_with(Prefix))
207       return true;
208   }
209 
210   // Filter the function profiles for the inlinees. It's useful for fuzzy
211   // profile matching which flattens the profile and inlinees' samples are
212   // merged into top-level function.
213   for (auto &Callees :
214        const_cast<CallsiteSampleMap &>(FS.getCallsiteSamples())) {
215     auto &CalleesMap = Callees.second;
216     for (auto I = CalleesMap.begin(); I != CalleesMap.end();) {
217       auto FS = I++;
218       if (filterAmbiguousProfile(FS->second))
219         CalleesMap.erase(FS);
220     }
221   }
222   return false;
223 }
224 
225 // For built-in local initialization function such as __cxx_global_var_init,
226 // __tls_init prefix function, there could be multiple versions of the functions
227 // in the final binary. However, in the profile generation, we call
228 // getCanonicalFnName to canonicalize the names which strips the suffixes.
229 // Therefore, samples from different functions queries the same profile and the
230 // samples are merged. As the functions are essentially different, entries of
231 // the merged profile are ambiguous. In sample loader, the IR from one version
232 // would be attributed towards a merged entries, which is inaccurate. Especially
233 // for fuzzy profile matching, it gets multiple callsites(from different
234 // function) but used to match one callsite, which misleads the matching and
235 // causes a lot of false positives report. Hence, we want to filter them out
236 // from the profile map during the profile generation time. The profiles are all
237 // cold functions, it won't have perf impact.
238 void ProfileGeneratorBase::filterAmbiguousProfile(SampleProfileMap &Profiles) {
239   for (auto I = ProfileMap.begin(); I != ProfileMap.end();) {
240     auto FS = I++;
241     if (filterAmbiguousProfile(FS->second))
242       ProfileMap.erase(FS);
243   }
244 }
245 
246 void ProfileGeneratorBase::findDisjointRanges(RangeSample &DisjointRanges,
247                                               const RangeSample &Ranges) {
248 
249   /*
250   Regions may overlap with each other. Using the boundary info, find all
251   disjoint ranges and their sample count. BoundaryPoint contains the count
252   multiple samples begin/end at this points.
253 
254   |<--100-->|           Sample1
255   |<------200------>|   Sample2
256   A         B       C
257 
258   In the example above,
259   Sample1 begins at A, ends at B, its value is 100.
260   Sample2 beings at A, ends at C, its value is 200.
261   For A, BeginCount is the sum of sample begins at A, which is 300 and no
262   samples ends at A, so EndCount is 0.
263   Then boundary points A, B, and C with begin/end counts are:
264   A: (300, 0)
265   B: (0, 100)
266   C: (0, 200)
267   */
268   struct BoundaryPoint {
269     // Sum of sample counts beginning at this point
270     uint64_t BeginCount = UINT64_MAX;
271     // Sum of sample counts ending at this point
272     uint64_t EndCount = UINT64_MAX;
273     // Is the begin point of a zero range.
274     bool IsZeroRangeBegin = false;
275     // Is the end point of a zero range.
276     bool IsZeroRangeEnd = false;
277 
278     void addBeginCount(uint64_t Count) {
279       if (BeginCount == UINT64_MAX)
280         BeginCount = 0;
281       BeginCount += Count;
282     }
283 
284     void addEndCount(uint64_t Count) {
285       if (EndCount == UINT64_MAX)
286         EndCount = 0;
287       EndCount += Count;
288     }
289   };
290 
291   /*
292   For the above example. With boundary points, follwing logic finds two
293   disjoint region of
294 
295   [A,B]:   300
296   [B+1,C]: 200
297 
298   If there is a boundary point that both begin and end, the point itself
299   becomes a separate disjoint region. For example, if we have original
300   ranges of
301 
302   |<--- 100 --->|
303                 |<--- 200 --->|
304   A             B             C
305 
306   there are three boundary points with their begin/end counts of
307 
308   A: (100, 0)
309   B: (200, 100)
310   C: (0, 200)
311 
312   the disjoint ranges would be
313 
314   [A, B-1]: 100
315   [B, B]:   300
316   [B+1, C]: 200.
317 
318   Example for zero value range:
319 
320     |<--- 100 --->|
321                        |<--- 200 --->|
322   |<---------------  0 ----------------->|
323   A  B            C    D             E   F
324 
325   [A, B-1]  : 0
326   [B, C]    : 100
327   [C+1, D-1]: 0
328   [D, E]    : 200
329   [E+1, F]  : 0
330   */
331   std::map<uint64_t, BoundaryPoint> Boundaries;
332 
333   for (const auto &Item : Ranges) {
334     assert(Item.first.first <= Item.first.second &&
335            "Invalid instruction range");
336     auto &BeginPoint = Boundaries[Item.first.first];
337     auto &EndPoint = Boundaries[Item.first.second];
338     uint64_t Count = Item.second;
339 
340     BeginPoint.addBeginCount(Count);
341     EndPoint.addEndCount(Count);
342     if (Count == 0) {
343       BeginPoint.IsZeroRangeBegin = true;
344       EndPoint.IsZeroRangeEnd = true;
345     }
346   }
347 
348   // Use UINT64_MAX to indicate there is no existing range between BeginAddress
349   // and the next valid address
350   uint64_t BeginAddress = UINT64_MAX;
351   int ZeroRangeDepth = 0;
352   uint64_t Count = 0;
353   for (const auto &Item : Boundaries) {
354     uint64_t Address = Item.first;
355     const BoundaryPoint &Point = Item.second;
356     if (Point.BeginCount != UINT64_MAX) {
357       if (BeginAddress != UINT64_MAX)
358         DisjointRanges[{BeginAddress, Address - 1}] = Count;
359       Count += Point.BeginCount;
360       BeginAddress = Address;
361       ZeroRangeDepth += Point.IsZeroRangeBegin;
362     }
363     if (Point.EndCount != UINT64_MAX) {
364       assert((BeginAddress != UINT64_MAX) &&
365              "First boundary point cannot be 'end' point");
366       DisjointRanges[{BeginAddress, Address}] = Count;
367       assert(Count >= Point.EndCount && "Mismatched live ranges");
368       Count -= Point.EndCount;
369       BeginAddress = Address + 1;
370       ZeroRangeDepth -= Point.IsZeroRangeEnd;
371       // If the remaining count is zero and it's no longer in a zero range, this
372       // means we consume all the ranges before, thus mark BeginAddress as
373       // UINT64_MAX. e.g. supposing we have two non-overlapping ranges:
374       //  [<---- 10 ---->]
375       //                       [<---- 20 ---->]
376       //   A             B     C              D
377       // The BeginAddress(B+1) will reset to invalid(UINT64_MAX), so we won't
378       // have the [B+1, C-1] zero range.
379       if (Count == 0 && ZeroRangeDepth == 0)
380         BeginAddress = UINT64_MAX;
381     }
382   }
383 }
384 
385 void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
386     FunctionSamples &FunctionProfile, const SampleContextFrame &LeafLoc,
387     uint64_t Count) {
388   // Use the maximum count of samples with same line location
389   uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
390 
391   // Use duplication factor to compensated for loop unroll/vectorization.
392   // Note that this is only needed when we're taking MAX of the counts at
393   // the location instead of SUM.
394   Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
395 
396   ErrorOr<uint64_t> R =
397       FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
398 
399   uint64_t PreviousCount = R ? R.get() : 0;
400   if (PreviousCount <= Count) {
401     FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
402                                    Count - PreviousCount);
403   }
404 }
405 
406 void ProfileGeneratorBase::updateTotalSamples() {
407   for (auto &Item : ProfileMap) {
408     FunctionSamples &FunctionProfile = Item.second;
409     FunctionProfile.updateTotalSamples();
410   }
411 }
412 
413 void ProfileGeneratorBase::updateCallsiteSamples() {
414   for (auto &Item : ProfileMap) {
415     FunctionSamples &FunctionProfile = Item.second;
416     FunctionProfile.updateCallsiteSamples();
417   }
418 }
419 
420 void ProfileGeneratorBase::updateFunctionSamples() {
421   updateCallsiteSamples();
422 
423   if (UpdateTotalSamples)
424     updateTotalSamples();
425 }
426 
427 void ProfileGeneratorBase::collectProfiledFunctions() {
428   std::unordered_set<const BinaryFunction *> ProfiledFunctions;
429   if (collectFunctionsFromRawProfile(ProfiledFunctions))
430     Binary->setProfiledFunctions(ProfiledFunctions);
431   else if (collectFunctionsFromLLVMProfile(ProfiledFunctions))
432     Binary->setProfiledFunctions(ProfiledFunctions);
433   else
434     llvm_unreachable("Unsupported input profile");
435 }
436 
437 bool ProfileGeneratorBase::collectFunctionsFromRawProfile(
438     std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
439   if (!SampleCounters)
440     return false;
441   // Go through all the stacks, ranges and branches in sample counters, use
442   // the start of the range to look up the function it belongs and record the
443   // function.
444   for (const auto &CI : *SampleCounters) {
445     if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(CI.first.getPtr())) {
446       for (auto StackAddr : CtxKey->Context) {
447         if (FuncRange *FRange = Binary->findFuncRange(StackAddr))
448           ProfiledFunctions.insert(FRange->Func);
449       }
450     }
451 
452     for (auto Item : CI.second.RangeCounter) {
453       uint64_t StartAddress = Item.first.first;
454       if (FuncRange *FRange = Binary->findFuncRange(StartAddress))
455         ProfiledFunctions.insert(FRange->Func);
456     }
457 
458     for (auto Item : CI.second.BranchCounter) {
459       uint64_t SourceAddress = Item.first.first;
460       uint64_t TargetAddress = Item.first.second;
461       if (FuncRange *FRange = Binary->findFuncRange(SourceAddress))
462         ProfiledFunctions.insert(FRange->Func);
463       if (FuncRange *FRange = Binary->findFuncRange(TargetAddress))
464         ProfiledFunctions.insert(FRange->Func);
465     }
466   }
467   return true;
468 }
469 
470 bool ProfileGenerator::collectFunctionsFromLLVMProfile(
471     std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
472   for (const auto &FS : ProfileMap) {
473     if (auto *Func = Binary->getBinaryFunction(FS.second.getFunction()))
474       ProfiledFunctions.insert(Func);
475   }
476   return true;
477 }
478 
479 bool CSProfileGenerator::collectFunctionsFromLLVMProfile(
480     std::unordered_set<const BinaryFunction *> &ProfiledFunctions) {
481   for (auto *Node : ContextTracker) {
482     if (!Node->getFuncName().empty())
483       if (auto *Func = Binary->getBinaryFunction(Node->getFuncName()))
484         ProfiledFunctions.insert(Func);
485   }
486   return true;
487 }
488 
489 FunctionSamples &
490 ProfileGenerator::getTopLevelFunctionProfile(FunctionId FuncName) {
491   SampleContext Context(FuncName);
492   return ProfileMap.create(Context);
493 }
494 
495 void ProfileGenerator::generateProfile() {
496   collectProfiledFunctions();
497 
498   if (Binary->usePseudoProbes())
499     Binary->decodePseudoProbe();
500 
501   if (SampleCounters) {
502     if (Binary->usePseudoProbes()) {
503       generateProbeBasedProfile();
504     } else {
505       generateLineNumBasedProfile();
506     }
507   }
508 
509   postProcessProfiles();
510 }
511 
512 void ProfileGenerator::postProcessProfiles() {
513   computeSummaryAndThreshold(ProfileMap);
514   trimColdProfiles(ProfileMap, ColdCountThreshold);
515   filterAmbiguousProfile(ProfileMap);
516   calculateAndShowDensity(ProfileMap);
517 }
518 
519 void ProfileGenerator::trimColdProfiles(const SampleProfileMap &Profiles,
520                                         uint64_t ColdCntThreshold) {
521   if (!TrimColdProfile)
522     return;
523 
524   // Move cold profiles into a tmp container.
525   std::vector<hash_code> ColdProfileHashes;
526   for (const auto &I : ProfileMap) {
527     if (I.second.getTotalSamples() < ColdCntThreshold)
528       ColdProfileHashes.emplace_back(I.first);
529   }
530 
531   // Remove the cold profile from ProfileMap.
532   for (const auto &I : ColdProfileHashes)
533     ProfileMap.erase(I);
534 }
535 
536 void ProfileGenerator::generateLineNumBasedProfile() {
537   assert(SampleCounters->size() == 1 &&
538          "Must have one entry for profile generation.");
539   const SampleCounter &SC = SampleCounters->begin()->second;
540   // Fill in function body samples
541   populateBodySamplesForAllFunctions(SC.RangeCounter);
542   // Fill in boundary sample counts as well as call site samples for calls
543   populateBoundarySamplesForAllFunctions(SC.BranchCounter);
544 
545   updateFunctionSamples();
546 }
547 
548 void ProfileGenerator::generateProbeBasedProfile() {
549   assert(SampleCounters->size() == 1 &&
550          "Must have one entry for profile generation.");
551   // Enable pseudo probe functionalities in SampleProf
552   FunctionSamples::ProfileIsProbeBased = true;
553   const SampleCounter &SC = SampleCounters->begin()->second;
554   // Fill in function body samples
555   populateBodySamplesWithProbesForAllFunctions(SC.RangeCounter);
556   // Fill in boundary sample counts as well as call site samples for calls
557   populateBoundarySamplesWithProbesForAllFunctions(SC.BranchCounter);
558 
559   updateFunctionSamples();
560 }
561 
562 void ProfileGenerator::populateBodySamplesWithProbesForAllFunctions(
563     const RangeSample &RangeCounter) {
564   ProbeCounterMap ProbeCounter;
565   // preprocessRangeCounter returns disjoint ranges, so no longer to redo it
566   // inside extractProbesFromRange.
567   extractProbesFromRange(preprocessRangeCounter(RangeCounter), ProbeCounter,
568                          false);
569 
570   for (const auto &PI : ProbeCounter) {
571     const MCDecodedPseudoProbe *Probe = PI.first;
572     uint64_t Count = PI.second;
573     SampleContextFrameVector FrameVec;
574     Binary->getInlineContextForProbe(Probe, FrameVec, true);
575     FunctionSamples &FunctionProfile =
576         getLeafProfileAndAddTotalSamples(FrameVec, Count);
577     FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
578                                    Count);
579     if (Probe->isEntry())
580       FunctionProfile.addHeadSamples(Count);
581   }
582 }
583 
584 void ProfileGenerator::populateBoundarySamplesWithProbesForAllFunctions(
585     const BranchSample &BranchCounters) {
586   for (const auto &Entry : BranchCounters) {
587     uint64_t SourceAddress = Entry.first.first;
588     uint64_t TargetAddress = Entry.first.second;
589     uint64_t Count = Entry.second;
590     assert(Count != 0 && "Unexpected zero weight branch");
591 
592     StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
593     if (CalleeName.size() == 0)
594       continue;
595 
596     const MCDecodedPseudoProbe *CallProbe =
597         Binary->getCallProbeForAddr(SourceAddress);
598     if (CallProbe == nullptr)
599       continue;
600 
601     // Record called target sample and its count.
602     SampleContextFrameVector FrameVec;
603     Binary->getInlineContextForProbe(CallProbe, FrameVec, true);
604 
605     if (!FrameVec.empty()) {
606       FunctionSamples &FunctionProfile =
607           getLeafProfileAndAddTotalSamples(FrameVec, 0);
608       FunctionProfile.addCalledTargetSamples(
609           FrameVec.back().Location.LineOffset,
610           FrameVec.back().Location.Discriminator,
611           FunctionId(CalleeName), Count);
612     }
613   }
614 }
615 
616 FunctionSamples &ProfileGenerator::getLeafProfileAndAddTotalSamples(
617     const SampleContextFrameVector &FrameVec, uint64_t Count) {
618   // Get top level profile
619   FunctionSamples *FunctionProfile =
620       &getTopLevelFunctionProfile(FrameVec[0].Func);
621   FunctionProfile->addTotalSamples(Count);
622   if (Binary->usePseudoProbes()) {
623     const auto *FuncDesc = Binary->getFuncDescForGUID(
624         FunctionProfile->getFunction().getHashCode());
625     FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
626   }
627 
628   for (size_t I = 1; I < FrameVec.size(); I++) {
629     LineLocation Callsite(
630         FrameVec[I - 1].Location.LineOffset,
631         getBaseDiscriminator(FrameVec[I - 1].Location.Discriminator));
632     FunctionSamplesMap &SamplesMap =
633         FunctionProfile->functionSamplesAt(Callsite);
634     auto Ret =
635         SamplesMap.emplace(FrameVec[I].Func, FunctionSamples());
636     if (Ret.second) {
637       SampleContext Context(FrameVec[I].Func);
638       Ret.first->second.setContext(Context);
639     }
640     FunctionProfile = &Ret.first->second;
641     FunctionProfile->addTotalSamples(Count);
642     if (Binary->usePseudoProbes()) {
643       const auto *FuncDesc = Binary->getFuncDescForGUID(
644           FunctionProfile->getFunction().getHashCode());
645       FunctionProfile->setFunctionHash(FuncDesc->FuncHash);
646     }
647   }
648 
649   return *FunctionProfile;
650 }
651 
652 RangeSample
653 ProfileGenerator::preprocessRangeCounter(const RangeSample &RangeCounter) {
654   RangeSample Ranges(RangeCounter.begin(), RangeCounter.end());
655   if (FillZeroForAllFuncs) {
656     for (auto &FuncI : Binary->getAllBinaryFunctions()) {
657       for (auto &R : FuncI.second.Ranges) {
658         Ranges[{R.first, R.second - 1}] += 0;
659       }
660     }
661   } else {
662     // For each range, we search for all ranges of the function it belongs to
663     // and initialize it with zero count, so it remains zero if doesn't hit any
664     // samples. This is to be consistent with compiler that interpret zero count
665     // as unexecuted(cold).
666     for (const auto &I : RangeCounter) {
667       uint64_t StartAddress = I.first.first;
668       for (const auto &Range : Binary->getRanges(StartAddress))
669         Ranges[{Range.first, Range.second - 1}] += 0;
670     }
671   }
672   RangeSample DisjointRanges;
673   findDisjointRanges(DisjointRanges, Ranges);
674   return DisjointRanges;
675 }
676 
677 void ProfileGenerator::populateBodySamplesForAllFunctions(
678     const RangeSample &RangeCounter) {
679   for (const auto &Range : preprocessRangeCounter(RangeCounter)) {
680     uint64_t RangeBegin = Range.first.first;
681     uint64_t RangeEnd = Range.first.second;
682     uint64_t Count = Range.second;
683 
684     InstructionPointer IP(Binary, RangeBegin, true);
685     // Disjoint ranges may have range in the middle of two instr,
686     // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
687     // can be Addr1+1 to Addr2-1. We should ignore such range.
688     if (IP.Address > RangeEnd)
689       continue;
690 
691     do {
692       const SampleContextFrameVector FrameVec =
693           Binary->getFrameLocationStack(IP.Address);
694       if (!FrameVec.empty()) {
695         // FIXME: As accumulating total count per instruction caused some
696         // regression, we changed to accumulate total count per byte as a
697         // workaround. Tuning hotness threshold on the compiler side might be
698         // necessary in the future.
699         FunctionSamples &FunctionProfile = getLeafProfileAndAddTotalSamples(
700             FrameVec, Count * Binary->getInstSize(IP.Address));
701         updateBodySamplesforFunctionProfile(FunctionProfile, FrameVec.back(),
702                                             Count);
703       }
704     } while (IP.advance() && IP.Address <= RangeEnd);
705   }
706 }
707 
708 StringRef
709 ProfileGeneratorBase::getCalleeNameForAddress(uint64_t TargetAddress) {
710   // Get the function range by branch target if it's a call branch.
711   auto *FRange = Binary->findFuncRangeForStartAddr(TargetAddress);
712 
713   // We won't accumulate sample count for a range whose start is not the real
714   // function entry such as outlined function or inner labels.
715   if (!FRange || !FRange->IsFuncEntry)
716     return StringRef();
717 
718   return FunctionSamples::getCanonicalFnName(FRange->getFuncName());
719 }
720 
721 void ProfileGenerator::populateBoundarySamplesForAllFunctions(
722     const BranchSample &BranchCounters) {
723   for (const auto &Entry : BranchCounters) {
724     uint64_t SourceAddress = Entry.first.first;
725     uint64_t TargetAddress = Entry.first.second;
726     uint64_t Count = Entry.second;
727     assert(Count != 0 && "Unexpected zero weight branch");
728 
729     StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
730     if (CalleeName.size() == 0)
731       continue;
732     // Record called target sample and its count.
733     const SampleContextFrameVector &FrameVec =
734         Binary->getCachedFrameLocationStack(SourceAddress);
735     if (!FrameVec.empty()) {
736       FunctionSamples &FunctionProfile =
737           getLeafProfileAndAddTotalSamples(FrameVec, 0);
738       FunctionProfile.addCalledTargetSamples(
739           FrameVec.back().Location.LineOffset,
740           getBaseDiscriminator(FrameVec.back().Location.Discriminator),
741           FunctionId(CalleeName), Count);
742     }
743     // Add head samples for callee.
744     FunctionSamples &CalleeProfile =
745         getTopLevelFunctionProfile(FunctionId(CalleeName));
746     CalleeProfile.addHeadSamples(Count);
747   }
748 }
749 
750 void ProfileGeneratorBase::calculateBodySamplesAndSize(
751     const FunctionSamples &FSamples, uint64_t &TotalBodySamples,
752     uint64_t &FuncBodySize) {
753   // Note that ideally the size should be the number of function instruction.
754   // However, for probe-based profile, we don't have the accurate instruction
755   // count for each probe, instead, the probe sample is the samples count for
756   // the block, which is equivelant to
757   // total_instruction_samples/num_of_instruction in one block. Hence, we use
758   // the number of probe as a proxy for the function's size.
759   FuncBodySize += FSamples.getBodySamples().size();
760 
761   // The accumulated body samples re-calculated here could be different from the
762   // TotalSamples(getTotalSamples) field of FunctionSamples for line-number
763   // based profile. The reason is that TotalSamples is the sum of all the
764   // samples of the machine instruction in one source-code line, however, the
765   // entry of Bodysamples is the only max number of them, so the TotalSamples is
766   // usually much bigger than the accumulated body samples as one souce-code
767   // line can emit many machine instructions. We observed a regression when we
768   // switched to use the accumulated body samples(by using
769   // -update-total-samples). Hence, it's safer to re-calculate here to avoid
770   // such discrepancy. There is no problem for probe-based profile, as the
771   // TotalSamples is exactly the same as the accumulated body samples.
772   for (const auto &I : FSamples.getBodySamples())
773     TotalBodySamples += I.second.getSamples();
774 
775   for (const auto &CallsiteSamples : FSamples.getCallsiteSamples())
776     for (const auto &Callee : CallsiteSamples.second) {
777       // For binary-level density, the inlinees' samples and size should be
778       // included in the calculation.
779       calculateBodySamplesAndSize(Callee.second, TotalBodySamples,
780                                   FuncBodySize);
781     }
782 }
783 
784 // Calculate Profile-density:
785 // Calculate the density for each function and sort them in descending order,
786 // keep accumulating their total samples unitl it exceeds the
787 // percentage_threshold(cut-off) of total profile samples, the profile-density
788 // is the last(minimum) function-density of the processed functions, which means
789 // all the functions hot to perf are on good density if the profile-density is
790 // good. The percentage_threshold(--profile-density-cutoff-hot) is configurable
791 // depending on how much regression the system want to tolerate.
792 double
793 ProfileGeneratorBase::calculateDensity(const SampleProfileMap &Profiles) {
794   double ProfileDensity = 0.0;
795 
796   uint64_t TotalProfileSamples = 0;
797   // A list of the function profile density and its total samples.
798   std::vector<std::pair<double, uint64_t>> FuncDensityList;
799   for (const auto &I : Profiles) {
800     uint64_t TotalBodySamples = 0;
801     uint64_t FuncBodySize = 0;
802     calculateBodySamplesAndSize(I.second, TotalBodySamples, FuncBodySize);
803 
804     if (FuncBodySize == 0)
805       continue;
806 
807     double FuncDensity = static_cast<double>(TotalBodySamples) / FuncBodySize;
808     TotalProfileSamples += TotalBodySamples;
809     FuncDensityList.emplace_back(FuncDensity, TotalBodySamples);
810   }
811 
812   // Sorted by the density in descending order.
813   llvm::stable_sort(FuncDensityList, [&](const std::pair<double, uint64_t> &A,
814                                          const std::pair<double, uint64_t> &B) {
815     if (A.first != B.first)
816       return A.first > B.first;
817     return A.second < B.second;
818   });
819 
820   uint64_t AccumulatedSamples = 0;
821   uint32_t I = 0;
822   assert(ProfileDensityCutOffHot <= 1000000 &&
823          "The cutoff value is greater than 1000000(100%)");
824   while (AccumulatedSamples < TotalProfileSamples *
825                                   static_cast<float>(ProfileDensityCutOffHot) /
826                                   1000000 &&
827          I < FuncDensityList.size()) {
828     AccumulatedSamples += FuncDensityList[I].second;
829     ProfileDensity = FuncDensityList[I].first;
830     I++;
831   }
832 
833   return ProfileDensity;
834 }
835 
836 void ProfileGeneratorBase::calculateAndShowDensity(
837     const SampleProfileMap &Profiles) {
838   double Density = calculateDensity(Profiles);
839   showDensitySuggestion(Density);
840 }
841 
842 FunctionSamples *
843 CSProfileGenerator::getOrCreateFunctionSamples(ContextTrieNode *ContextNode,
844                                                bool WasLeafInlined) {
845   FunctionSamples *FProfile = ContextNode->getFunctionSamples();
846   if (!FProfile) {
847     FSamplesList.emplace_back();
848     FProfile = &FSamplesList.back();
849     FProfile->setFunction(ContextNode->getFuncName());
850     ContextNode->setFunctionSamples(FProfile);
851   }
852   // Update ContextWasInlined attribute for existing contexts.
853   // The current function can be called in two ways:
854   //  - when processing a probe of the current frame
855   //  - when processing the entry probe of an inlinee's frame, which
856   //    is then used to update the callsite count of the current frame.
857   // The two can happen in any order, hence here we are making sure
858   // `ContextWasInlined` is always set as expected.
859   // TODO: Note that the former does not always happen if no probes of the
860   // current frame has samples, and if the latter happens, we could lose the
861   // attribute. This should be fixed.
862   if (WasLeafInlined)
863     FProfile->getContext().setAttribute(ContextWasInlined);
864   return FProfile;
865 }
866 
867 ContextTrieNode *
868 CSProfileGenerator::getOrCreateContextNode(const SampleContextFrames Context,
869                                            bool WasLeafInlined) {
870   ContextTrieNode *ContextNode =
871       ContextTracker.getOrCreateContextPath(Context, true);
872   getOrCreateFunctionSamples(ContextNode, WasLeafInlined);
873   return ContextNode;
874 }
875 
876 void CSProfileGenerator::generateProfile() {
877   FunctionSamples::ProfileIsCS = true;
878 
879   collectProfiledFunctions();
880 
881   if (Binary->usePseudoProbes()) {
882     Binary->decodePseudoProbe();
883     if (InferMissingFrames)
884       initializeMissingFrameInferrer();
885   }
886 
887   if (SampleCounters) {
888     if (Binary->usePseudoProbes()) {
889       generateProbeBasedProfile();
890     } else {
891       generateLineNumBasedProfile();
892     }
893   }
894 
895   if (Binary->getTrackFuncContextSize())
896     computeSizeForProfiledFunctions();
897 
898   postProcessProfiles();
899 }
900 
901 void CSProfileGenerator::initializeMissingFrameInferrer() {
902   Binary->getMissingContextInferrer()->initialize(SampleCounters);
903 }
904 
905 void CSProfileGenerator::inferMissingFrames(
906     const SmallVectorImpl<uint64_t> &Context,
907     SmallVectorImpl<uint64_t> &NewContext) {
908   Binary->inferMissingFrames(Context, NewContext);
909 }
910 
911 void CSProfileGenerator::computeSizeForProfiledFunctions() {
912   for (auto *Func : Binary->getProfiledFunctions())
913     Binary->computeInlinedContextSizeForFunc(Func);
914 
915   // Flush the symbolizer to save memory.
916   Binary->flushSymbolizer();
917 }
918 
919 void CSProfileGenerator::updateFunctionSamples() {
920   for (auto *Node : ContextTracker) {
921     FunctionSamples *FSamples = Node->getFunctionSamples();
922     if (FSamples) {
923       if (UpdateTotalSamples)
924         FSamples->updateTotalSamples();
925       FSamples->updateCallsiteSamples();
926     }
927   }
928 }
929 
930 void CSProfileGenerator::generateLineNumBasedProfile() {
931   for (const auto &CI : *SampleCounters) {
932     const auto *CtxKey = cast<StringBasedCtxKey>(CI.first.getPtr());
933 
934     ContextTrieNode *ContextNode = &getRootContext();
935     // Sample context will be empty if the jump is an external-to-internal call
936     // pattern, the head samples should be added for the internal function.
937     if (!CtxKey->Context.empty()) {
938       // Get or create function profile for the range
939       ContextNode =
940           getOrCreateContextNode(CtxKey->Context, CtxKey->WasLeafInlined);
941       // Fill in function body samples
942       populateBodySamplesForFunction(*ContextNode->getFunctionSamples(),
943                                      CI.second.RangeCounter);
944     }
945     // Fill in boundary sample counts as well as call site samples for calls
946     populateBoundarySamplesForFunction(ContextNode, CI.second.BranchCounter);
947   }
948   // Fill in call site value sample for inlined calls and also use context to
949   // infer missing samples. Since we don't have call count for inlined
950   // functions, we estimate it from inlinee's profile using the entry of the
951   // body sample.
952   populateInferredFunctionSamples(getRootContext());
953 
954   updateFunctionSamples();
955 }
956 
957 void CSProfileGenerator::populateBodySamplesForFunction(
958     FunctionSamples &FunctionProfile, const RangeSample &RangeCounter) {
959   // Compute disjoint ranges first, so we can use MAX
960   // for calculating count for each location.
961   RangeSample Ranges;
962   findDisjointRanges(Ranges, RangeCounter);
963   for (const auto &Range : Ranges) {
964     uint64_t RangeBegin = Range.first.first;
965     uint64_t RangeEnd = Range.first.second;
966     uint64_t Count = Range.second;
967     // Disjoint ranges have introduce zero-filled gap that
968     // doesn't belong to current context, filter them out.
969     if (Count == 0)
970       continue;
971 
972     InstructionPointer IP(Binary, RangeBegin, true);
973     // Disjoint ranges may have range in the middle of two instr,
974     // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
975     // can be Addr1+1 to Addr2-1. We should ignore such range.
976     if (IP.Address > RangeEnd)
977       continue;
978 
979     do {
980       auto LeafLoc = Binary->getInlineLeafFrameLoc(IP.Address);
981       if (LeafLoc) {
982         // Recording body sample for this specific context
983         updateBodySamplesforFunctionProfile(FunctionProfile, *LeafLoc, Count);
984         FunctionProfile.addTotalSamples(Count);
985       }
986     } while (IP.advance() && IP.Address <= RangeEnd);
987   }
988 }
989 
990 void CSProfileGenerator::populateBoundarySamplesForFunction(
991     ContextTrieNode *Node, const BranchSample &BranchCounters) {
992 
993   for (const auto &Entry : BranchCounters) {
994     uint64_t SourceAddress = Entry.first.first;
995     uint64_t TargetAddress = Entry.first.second;
996     uint64_t Count = Entry.second;
997     assert(Count != 0 && "Unexpected zero weight branch");
998 
999     StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1000     if (CalleeName.size() == 0)
1001       continue;
1002 
1003     ContextTrieNode *CallerNode = Node;
1004     LineLocation CalleeCallSite(0, 0);
1005     if (CallerNode != &getRootContext()) {
1006       // Record called target sample and its count
1007       auto LeafLoc = Binary->getInlineLeafFrameLoc(SourceAddress);
1008       if (LeafLoc) {
1009         CallerNode->getFunctionSamples()->addCalledTargetSamples(
1010             LeafLoc->Location.LineOffset,
1011             getBaseDiscriminator(LeafLoc->Location.Discriminator),
1012             FunctionId(CalleeName),
1013             Count);
1014         // Record head sample for called target(callee)
1015         CalleeCallSite = LeafLoc->Location;
1016       }
1017     }
1018 
1019     ContextTrieNode *CalleeNode =
1020         CallerNode->getOrCreateChildContext(CalleeCallSite,
1021                                             FunctionId(CalleeName));
1022     FunctionSamples *CalleeProfile = getOrCreateFunctionSamples(CalleeNode);
1023     CalleeProfile->addHeadSamples(Count);
1024   }
1025 }
1026 
1027 void CSProfileGenerator::populateInferredFunctionSamples(
1028     ContextTrieNode &Node) {
1029   // There is no call jmp sample between the inliner and inlinee, we need to use
1030   // the inlinee's context to infer inliner's context, i.e. parent(inliner)'s
1031   // sample depends on child(inlinee)'s sample, so traverse the tree in
1032   // post-order.
1033   for (auto &It : Node.getAllChildContext())
1034     populateInferredFunctionSamples(It.second);
1035 
1036   FunctionSamples *CalleeProfile = Node.getFunctionSamples();
1037   if (!CalleeProfile)
1038     return;
1039   // If we already have head sample counts, we must have value profile
1040   // for call sites added already. Skip to avoid double counting.
1041   if (CalleeProfile->getHeadSamples())
1042     return;
1043   ContextTrieNode *CallerNode = Node.getParentContext();
1044   // If we don't have context, nothing to do for caller's call site.
1045   // This could happen for entry point function.
1046   if (CallerNode == &getRootContext())
1047     return;
1048 
1049   LineLocation CallerLeafFrameLoc = Node.getCallSiteLoc();
1050   FunctionSamples &CallerProfile = *getOrCreateFunctionSamples(CallerNode);
1051   // Since we don't have call count for inlined functions, we
1052   // estimate it from inlinee's profile using entry body sample.
1053   uint64_t EstimatedCallCount = CalleeProfile->getHeadSamplesEstimate();
1054   // If we don't have samples with location, use 1 to indicate live.
1055   if (!EstimatedCallCount && !CalleeProfile->getBodySamples().size())
1056     EstimatedCallCount = 1;
1057   CallerProfile.addCalledTargetSamples(CallerLeafFrameLoc.LineOffset,
1058                                        CallerLeafFrameLoc.Discriminator,
1059                                        Node.getFuncName(), EstimatedCallCount);
1060   CallerProfile.addBodySamples(CallerLeafFrameLoc.LineOffset,
1061                                CallerLeafFrameLoc.Discriminator,
1062                                EstimatedCallCount);
1063   CallerProfile.addTotalSamples(EstimatedCallCount);
1064 }
1065 
1066 void CSProfileGenerator::convertToProfileMap(
1067     ContextTrieNode &Node, SampleContextFrameVector &Context) {
1068   FunctionSamples *FProfile = Node.getFunctionSamples();
1069   if (FProfile) {
1070     Context.emplace_back(Node.getFuncName(), LineLocation(0, 0));
1071     // Save the new context for future references.
1072     SampleContextFrames NewContext = *Contexts.insert(Context).first;
1073     auto Ret = ProfileMap.emplace(NewContext, std::move(*FProfile));
1074     FunctionSamples &NewProfile = Ret.first->second;
1075     NewProfile.getContext().setContext(NewContext);
1076     Context.pop_back();
1077   }
1078 
1079   for (auto &It : Node.getAllChildContext()) {
1080     ContextTrieNode &ChildNode = It.second;
1081     Context.emplace_back(Node.getFuncName(), ChildNode.getCallSiteLoc());
1082     convertToProfileMap(ChildNode, Context);
1083     Context.pop_back();
1084   }
1085 }
1086 
1087 void CSProfileGenerator::convertToProfileMap() {
1088   assert(ProfileMap.empty() &&
1089          "ProfileMap should be empty before converting from the trie");
1090   assert(IsProfileValidOnTrie &&
1091          "Do not convert the trie twice, it's already destroyed");
1092 
1093   SampleContextFrameVector Context;
1094   for (auto &It : getRootContext().getAllChildContext())
1095     convertToProfileMap(It.second, Context);
1096 
1097   IsProfileValidOnTrie = false;
1098 }
1099 
1100 void CSProfileGenerator::postProcessProfiles() {
1101   // Compute hot/cold threshold based on profile. This will be used for cold
1102   // context profile merging/trimming.
1103   computeSummaryAndThreshold();
1104 
1105   // Run global pre-inliner to adjust/merge context profile based on estimated
1106   // inline decisions.
1107   if (EnableCSPreInliner) {
1108     ContextTracker.populateFuncToCtxtMap();
1109     CSPreInliner(ContextTracker, *Binary, Summary.get()).run();
1110     // Turn off the profile merger by default unless it is explicitly enabled.
1111     if (!CSProfMergeColdContext.getNumOccurrences())
1112       CSProfMergeColdContext = false;
1113   }
1114 
1115   convertToProfileMap();
1116 
1117   // Trim and merge cold context profile using cold threshold above.
1118   if (TrimColdProfile || CSProfMergeColdContext) {
1119     SampleContextTrimmer(ProfileMap)
1120         .trimAndMergeColdContextProfiles(
1121             HotCountThreshold, TrimColdProfile, CSProfMergeColdContext,
1122             CSProfMaxColdContextDepth, EnableCSPreInliner);
1123   }
1124 
1125   if (GenCSNestedProfile) {
1126     ProfileConverter CSConverter(ProfileMap);
1127     CSConverter.convertCSProfiles();
1128     FunctionSamples::ProfileIsCS = false;
1129   }
1130   filterAmbiguousProfile(ProfileMap);
1131   ProfileGeneratorBase::calculateAndShowDensity(ProfileMap);
1132 }
1133 
1134 void ProfileGeneratorBase::computeSummaryAndThreshold(
1135     SampleProfileMap &Profiles) {
1136   SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
1137   Summary = Builder.computeSummaryForProfiles(Profiles);
1138   HotCountThreshold = ProfileSummaryBuilder::getHotCountThreshold(
1139       (Summary->getDetailedSummary()));
1140   ColdCountThreshold = ProfileSummaryBuilder::getColdCountThreshold(
1141       (Summary->getDetailedSummary()));
1142 }
1143 
1144 void CSProfileGenerator::computeSummaryAndThreshold() {
1145   // Always merge and use context-less profile map to compute summary.
1146   SampleProfileMap ContextLessProfiles;
1147   ContextTracker.createContextLessProfileMap(ContextLessProfiles);
1148 
1149   // Set the flag below to avoid merging the profile again in
1150   // computeSummaryAndThreshold
1151   FunctionSamples::ProfileIsCS = false;
1152   assert(
1153       (!UseContextLessSummary.getNumOccurrences() || UseContextLessSummary) &&
1154       "Don't set --profile-summary-contextless to false for profile "
1155       "generation");
1156   ProfileGeneratorBase::computeSummaryAndThreshold(ContextLessProfiles);
1157   // Recover the old value.
1158   FunctionSamples::ProfileIsCS = true;
1159 }
1160 
1161 void ProfileGeneratorBase::extractProbesFromRange(
1162     const RangeSample &RangeCounter, ProbeCounterMap &ProbeCounter,
1163     bool FindDisjointRanges) {
1164   const RangeSample *PRanges = &RangeCounter;
1165   RangeSample Ranges;
1166   if (FindDisjointRanges) {
1167     findDisjointRanges(Ranges, RangeCounter);
1168     PRanges = &Ranges;
1169   }
1170 
1171   for (const auto &Range : *PRanges) {
1172     uint64_t RangeBegin = Range.first.first;
1173     uint64_t RangeEnd = Range.first.second;
1174     uint64_t Count = Range.second;
1175 
1176     InstructionPointer IP(Binary, RangeBegin, true);
1177     // Disjoint ranges may have range in the middle of two instr,
1178     // e.g. If Instr1 at Addr1, and Instr2 at Addr2, disjoint range
1179     // can be Addr1+1 to Addr2-1. We should ignore such range.
1180     if (IP.Address > RangeEnd)
1181       continue;
1182 
1183     do {
1184       const AddressProbesMap &Address2ProbesMap =
1185           Binary->getAddress2ProbesMap();
1186       for (const MCDecodedPseudoProbe &Probe :
1187            Address2ProbesMap.find(IP.Address)) {
1188         ProbeCounter[&Probe] += Count;
1189       }
1190     } while (IP.advance() && IP.Address <= RangeEnd);
1191   }
1192 }
1193 
1194 static void extractPrefixContextStack(SampleContextFrameVector &ContextStack,
1195                                       const SmallVectorImpl<uint64_t> &AddrVec,
1196                                       ProfiledBinary *Binary) {
1197   SmallVector<const MCDecodedPseudoProbe *, 16> Probes;
1198   for (auto Address : reverse(AddrVec)) {
1199     const MCDecodedPseudoProbe *CallProbe =
1200         Binary->getCallProbeForAddr(Address);
1201     // These could be the cases when a probe is not found at a calliste. Cutting
1202     // off the context from here since the inliner will not know how to consume
1203     // a context with unknown callsites.
1204     // 1. for functions that are not sampled when
1205     // --decode-probe-for-profiled-functions-only is on.
1206     // 2. for a merged callsite. Callsite merging may cause the loss of original
1207     // probe IDs.
1208     // 3. for an external callsite.
1209     if (!CallProbe)
1210       break;
1211     Probes.push_back(CallProbe);
1212   }
1213 
1214   std::reverse(Probes.begin(), Probes.end());
1215 
1216   // Extract context stack for reusing, leaf context stack will be added
1217   // compressed while looking up function profile.
1218   for (const auto *P : Probes) {
1219     Binary->getInlineContextForProbe(P, ContextStack, true);
1220   }
1221 }
1222 
1223 void CSProfileGenerator::generateProbeBasedProfile() {
1224   // Enable pseudo probe functionalities in SampleProf
1225   FunctionSamples::ProfileIsProbeBased = true;
1226   for (const auto &CI : *SampleCounters) {
1227     const AddrBasedCtxKey *CtxKey =
1228         dyn_cast<AddrBasedCtxKey>(CI.first.getPtr());
1229     // Fill in function body samples from probes, also infer caller's samples
1230     // from callee's probe
1231     populateBodySamplesWithProbes(CI.second.RangeCounter, CtxKey);
1232     // Fill in boundary samples for a call probe
1233     populateBoundarySamplesWithProbes(CI.second.BranchCounter, CtxKey);
1234   }
1235 }
1236 
1237 void CSProfileGenerator::populateBodySamplesWithProbes(
1238     const RangeSample &RangeCounter, const AddrBasedCtxKey *CtxKey) {
1239   ProbeCounterMap ProbeCounter;
1240   // Extract the top frame probes by looking up each address among the range in
1241   // the Address2ProbeMap
1242   extractProbesFromRange(RangeCounter, ProbeCounter);
1243   std::unordered_map<MCDecodedPseudoProbeInlineTree *,
1244                      std::unordered_set<FunctionSamples *>>
1245       FrameSamples;
1246   for (const auto &PI : ProbeCounter) {
1247     const MCDecodedPseudoProbe *Probe = PI.first;
1248     uint64_t Count = PI.second;
1249     // Disjoint ranges have introduce zero-filled gap that
1250     // doesn't belong to current context, filter them out.
1251     if (!Probe->isBlock() || Count == 0)
1252       continue;
1253 
1254     ContextTrieNode *ContextNode = getContextNodeForLeafProbe(CtxKey, Probe);
1255     FunctionSamples &FunctionProfile = *ContextNode->getFunctionSamples();
1256     // Record the current frame and FunctionProfile whenever samples are
1257     // collected for non-danglie probes. This is for reporting all of the
1258     // zero count probes of the frame later.
1259     FrameSamples[Probe->getInlineTreeNode()].insert(&FunctionProfile);
1260     FunctionProfile.addBodySamples(Probe->getIndex(), Probe->getDiscriminator(),
1261                                    Count);
1262     FunctionProfile.addTotalSamples(Count);
1263     if (Probe->isEntry()) {
1264       FunctionProfile.addHeadSamples(Count);
1265       // Look up for the caller's function profile
1266       const auto *InlinerDesc = Binary->getInlinerDescForProbe(Probe);
1267       ContextTrieNode *CallerNode = ContextNode->getParentContext();
1268       if (InlinerDesc != nullptr && CallerNode != &getRootContext()) {
1269         // Since the context id will be compressed, we have to use callee's
1270         // context id to infer caller's context id to ensure they share the
1271         // same context prefix.
1272         uint64_t CallerIndex = ContextNode->getCallSiteLoc().LineOffset;
1273         uint64_t CallerDiscriminator = ContextNode->getCallSiteLoc().Discriminator;
1274         assert(CallerIndex &&
1275                "Inferred caller's location index shouldn't be zero!");
1276         assert(!CallerDiscriminator &&
1277                "Callsite probe should not have a discriminator!");
1278         FunctionSamples &CallerProfile =
1279             *getOrCreateFunctionSamples(CallerNode);
1280         CallerProfile.setFunctionHash(InlinerDesc->FuncHash);
1281         CallerProfile.addBodySamples(CallerIndex, CallerDiscriminator, Count);
1282         CallerProfile.addTotalSamples(Count);
1283         CallerProfile.addCalledTargetSamples(CallerIndex, CallerDiscriminator,
1284                                              ContextNode->getFuncName(), Count);
1285       }
1286     }
1287   }
1288 
1289   // Assign zero count for remaining probes without sample hits to
1290   // differentiate from probes optimized away, of which the counts are unknown
1291   // and will be inferred by the compiler.
1292   for (auto &I : FrameSamples) {
1293     for (auto *FunctionProfile : I.second) {
1294       for (const MCDecodedPseudoProbe &Probe : I.first->getProbes()) {
1295         FunctionProfile->addBodySamples(Probe.getIndex(),
1296                                         Probe.getDiscriminator(), 0);
1297       }
1298     }
1299   }
1300 }
1301 
1302 void CSProfileGenerator::populateBoundarySamplesWithProbes(
1303     const BranchSample &BranchCounter, const AddrBasedCtxKey *CtxKey) {
1304   for (const auto &BI : BranchCounter) {
1305     uint64_t SourceAddress = BI.first.first;
1306     uint64_t TargetAddress = BI.first.second;
1307     uint64_t Count = BI.second;
1308     const MCDecodedPseudoProbe *CallProbe =
1309         Binary->getCallProbeForAddr(SourceAddress);
1310     if (CallProbe == nullptr)
1311       continue;
1312     FunctionSamples &FunctionProfile =
1313         getFunctionProfileForLeafProbe(CtxKey, CallProbe);
1314     FunctionProfile.addBodySamples(CallProbe->getIndex(), 0, Count);
1315     FunctionProfile.addTotalSamples(Count);
1316     StringRef CalleeName = getCalleeNameForAddress(TargetAddress);
1317     if (CalleeName.size() == 0)
1318       continue;
1319     FunctionProfile.addCalledTargetSamples(CallProbe->getIndex(),
1320                                            CallProbe->getDiscriminator(),
1321                                            FunctionId(CalleeName), Count);
1322   }
1323 }
1324 
1325 ContextTrieNode *CSProfileGenerator::getContextNodeForLeafProbe(
1326     const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1327 
1328   const SmallVectorImpl<uint64_t> *PContext = &CtxKey->Context;
1329   SmallVector<uint64_t, 16> NewContext;
1330 
1331   if (InferMissingFrames) {
1332     SmallVector<uint64_t, 16> Context = CtxKey->Context;
1333     // Append leaf frame for a complete inference.
1334     Context.push_back(LeafProbe->getAddress());
1335     inferMissingFrames(Context, NewContext);
1336     // Pop out the leaf probe that was pushed in above.
1337     NewContext.pop_back();
1338     PContext = &NewContext;
1339   }
1340 
1341   SampleContextFrameVector ContextStack;
1342   extractPrefixContextStack(ContextStack, *PContext, Binary);
1343 
1344   // Explicitly copy the context for appending the leaf context
1345   SampleContextFrameVector NewContextStack(ContextStack.begin(),
1346                                            ContextStack.end());
1347   Binary->getInlineContextForProbe(LeafProbe, NewContextStack, true);
1348   // For leaf inlined context with the top frame, we should strip off the top
1349   // frame's probe id, like:
1350   // Inlined stack: [foo:1, bar:2], the ContextId will be "foo:1 @ bar"
1351   auto LeafFrame = NewContextStack.back();
1352   LeafFrame.Location = LineLocation(0, 0);
1353   NewContextStack.pop_back();
1354   // Compress the context string except for the leaf frame
1355   CSProfileGenerator::compressRecursionContext(NewContextStack);
1356   CSProfileGenerator::trimContext(NewContextStack);
1357   NewContextStack.push_back(LeafFrame);
1358 
1359   const auto *FuncDesc = Binary->getFuncDescForGUID(LeafProbe->getGuid());
1360   bool WasLeafInlined = LeafProbe->getInlineTreeNode()->hasInlineSite();
1361   ContextTrieNode *ContextNode =
1362       getOrCreateContextNode(NewContextStack, WasLeafInlined);
1363   ContextNode->getFunctionSamples()->setFunctionHash(FuncDesc->FuncHash);
1364   return ContextNode;
1365 }
1366 
1367 FunctionSamples &CSProfileGenerator::getFunctionProfileForLeafProbe(
1368     const AddrBasedCtxKey *CtxKey, const MCDecodedPseudoProbe *LeafProbe) {
1369   return *getContextNodeForLeafProbe(CtxKey, LeafProbe)->getFunctionSamples();
1370 }
1371 
1372 } // end namespace sampleprof
1373 } // end namespace llvm
1374