//===- MemProfiler.cpp - memory allocation and access profiler ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of MemProfiler. Memory accesses are instrumented
// to increment the access count held in a shadow memory location, or
// alternatively to call into the runtime. Memory intrinsic calls (memmove,
// memcpy, memset) are changed to call the memory profiling runtime version
// instead.
//
//===----------------------------------------------------------------------===//
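//
// Illustrative sketch (not verbatim output of this pass; value names are
// hypothetical): with the default 64-byte granularity and shadow scale 3, a
// profiled load such as
//
//   %v = load i32, ptr %p
//
// is conceptually rewritten into an inline shadow-counter increment:
//
//   %a   = ptrtoint ptr %p to i64
//   %m   = and i64 %a, -64          ; Mem & ~(Granularity - 1)
//   %s   = lshr i64 %m, 3           ; >> Scale
//   %o   = add i64 %s, %shadow_base ; + __memprof_shadow_memory_dynamic_address
//   %c   = inttoptr i64 %o to ptr
//   %old = load i64, ptr %c
//   %new = add i64 %old, 1
//   store i64 %new, ptr %c
//   %v   = load i32, ptr %p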

#include "llvm/Transforms/Instrumentation/MemProfiler.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/InstrProfReader.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/HashBuilder.h"
#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LongestCommonSequence.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <map>
#include <set>

using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof"

namespace llvm {
extern cl::opt<bool> PGOWarnMissing;
extern cl::opt<bool> NoPGOWarnMismatch;
extern cl::opt<bool> NoPGOWarnMismatchComdatWeak;
} // namespace llvm

constexpr int LLVM_MEM_PROFILER_VERSION = 1;

// Size of memory mapped to a single shadow location.
constexpr uint64_t DefaultMemGranularity = 64;

// Size of memory mapped to a single histogram bucket.
constexpr uint64_t HistogramGranularity = 8;

// Scale from granularity down to shadow size.
constexpr uint64_t DefaultShadowScale = 3;

constexpr char MemProfModuleCtorName[] = "memprof.module_ctor";
constexpr uint64_t MemProfCtorAndDtorPriority = 1;
// On Emscripten, the system needs more than one priority for constructors.
constexpr uint64_t MemProfEmscriptenCtorAndDtorPriority = 50;
constexpr char MemProfInitName[] = "__memprof_init";
constexpr char MemProfVersionCheckNamePrefix[] =
    "__memprof_version_mismatch_check_v";

constexpr char MemProfShadowMemoryDynamicAddress[] =
    "__memprof_shadow_memory_dynamic_address";

constexpr char MemProfFilenameVar[] = "__memprof_profile_filename";

constexpr char MemProfHistogramFlagVar[] = "__memprof_histogram";

// Command-line flags.

static cl::opt<bool> ClInsertVersionCheck(
    "memprof-guard-against-version-mismatch",
    cl::desc("Guard against compiler/runtime version mismatch."), cl::Hidden,
    cl::init(true));

// This flag may need to be replaced with -f[no-]memprof-reads.
static cl::opt<bool> ClInstrumentReads("memprof-instrument-reads",
                                       cl::desc("instrument read instructions"),
                                       cl::Hidden, cl::init(true));

static cl::opt<bool>
    ClInstrumentWrites("memprof-instrument-writes",
                       cl::desc("instrument write instructions"), cl::Hidden,
                       cl::init(true));

static cl::opt<bool> ClInstrumentAtomics(
    "memprof-instrument-atomics",
    cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
    cl::init(true));

static cl::opt<bool> ClUseCalls(
    "memprof-use-callbacks",
    cl::desc("Use callbacks instead of inline instrumentation sequences."),
    cl::Hidden, cl::init(false));

static cl::opt<std::string>
    ClMemoryAccessCallbackPrefix("memprof-memory-access-callback-prefix",
                                 cl::desc("Prefix for memory access callbacks"),
                                 cl::Hidden, cl::init("__memprof_"));

// These flags allow changing the shadow mapping.
// The shadow mapping looks like
//    Shadow = ((Mem & mask) >> scale) + offset
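//
// Worked example (illustrative): with the defaults Granularity = 64 and
// Scale = 3, mask = ~63, so an access at Mem = 0x1027 maps to
//    ((0x1027 & ~63) >> 3) + offset = (0x1000 >> 3) + offset = 0x200 + offset,
// and every address in the 64-byte granule [0x1000, 0x103f] shares that same
// 8-byte (i64) shadow counter. In histogram mode the granularity drops to 8
// bytes and each bucket narrows to a single saturating i8 counter.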

static cl::opt<int> ClMappingScale("memprof-mapping-scale",
                                   cl::desc("scale of memprof shadow mapping"),
                                   cl::Hidden, cl::init(DefaultShadowScale));

static cl::opt<int>
    ClMappingGranularity("memprof-mapping-granularity",
                         cl::desc("granularity of memprof shadow mapping"),
                         cl::Hidden, cl::init(DefaultMemGranularity));

static cl::opt<bool> ClStack("memprof-instrument-stack",
                             cl::desc("Instrument scalar stack variables"),
                             cl::Hidden, cl::init(false));

// Debug flags.

static cl::opt<int> ClDebug("memprof-debug", cl::desc("debug"), cl::Hidden,
                            cl::init(0));

static cl::opt<std::string> ClDebugFunc("memprof-debug-func", cl::Hidden,
                                        cl::desc("Debug func"));

static cl::opt<int> ClDebugMin("memprof-debug-min", cl::desc("Debug min inst"),
                               cl::Hidden, cl::init(-1));

static cl::opt<int> ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"),
                               cl::Hidden, cl::init(-1));

// By default disable matching of allocation profiles onto operator new that
// already explicitly pass a hot/cold hint, since we don't currently
// override these hints anyway.
static cl::opt<bool> ClMemProfMatchHotColdNew(
    "memprof-match-hot-cold-new",
    cl::desc(
        "Match allocation profiles onto existing hot/cold operator new calls"),
    cl::Hidden, cl::init(false));

static cl::opt<bool> ClHistogram("memprof-histogram",
                                 cl::desc("Collect access count histograms"),
                                 cl::Hidden, cl::init(false));

static cl::opt<bool>
    ClPrintMemProfMatchInfo("memprof-print-match-info",
                            cl::desc("Print matching stats for each allocation "
                                     "context in this module's profiles"),
                            cl::Hidden, cl::init(false));

static cl::opt<std::string>
    MemprofRuntimeDefaultOptions("memprof-runtime-default-options",
                                 cl::desc("The default memprof options"),
                                 cl::Hidden, cl::init(""));

static cl::opt<bool>
    SalvageStaleProfile("memprof-salvage-stale-profile",
                        cl::desc("Salvage stale MemProf profile"),
                        cl::init(false), cl::Hidden);

cl::opt<unsigned> MinClonedColdBytePercent(
    "memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));

extern cl::opt<bool> MemProfReportHintedSizes;

static cl::opt<unsigned> MinMatchedColdBytePercent(
    "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes matched to hint allocation cold"));

// Instrumentation statistics
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
STATISTIC(NumSkippedStackReads, "Number of non-instrumented stack reads");
STATISTIC(NumSkippedStackWrites, "Number of non-instrumented stack writes");

// Matching statistics
STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
STATISTIC(NumOfMemProfMismatch,
          "Number of functions having mismatched memory profile hash.");
STATISTIC(NumOfMemProfFunc, "Number of functions having valid memory profile.");
STATISTIC(NumOfMemProfAllocContextProfiles,
          "Number of alloc contexts in memory profile.");
STATISTIC(NumOfMemProfCallSiteProfiles,
          "Number of callsites in memory profile.");
STATISTIC(NumOfMemProfMatchedAllocContexts,
          "Number of matched memory profile alloc contexts.");
STATISTIC(NumOfMemProfMatchedAllocs,
          "Number of matched memory profile allocs.");
STATISTIC(NumOfMemProfMatchedCallSites,
          "Number of matched memory profile callsites.");

namespace {

/// This struct defines the shadow mapping using the rule:
///   shadow = ((mem & mask) >> Scale) + DynamicShadowOffset.
struct ShadowMapping {
  ShadowMapping() {
    Scale = ClMappingScale;
    Granularity = ClHistogram ? HistogramGranularity : ClMappingGranularity;
    Mask = ~(Granularity - 1);
  }

  int Scale;
  int Granularity;
  uint64_t Mask; // Computed as ~(Granularity-1)
};

static uint64_t getCtorAndDtorPriority(Triple &TargetTriple) {
  return TargetTriple.isOSEmscripten() ? MemProfEmscriptenCtorAndDtorPriority
                                       : MemProfCtorAndDtorPriority;
}

struct InterestingMemoryAccess {
  Value *Addr = nullptr;
  bool IsWrite;
  Type *AccessTy;
  Value *MaybeMask = nullptr;
};

/// Instrument the code in module to profile memory accesses.
class MemProfiler {
public:
  MemProfiler(Module &M) {
    C = &(M.getContext());
    LongSize = M.getDataLayout().getPointerSizeInBits();
    IntptrTy = Type::getIntNTy(*C, LongSize);
    PtrTy = PointerType::getUnqual(*C);
  }

  /// If it is an interesting memory access, populate information
  /// about the access and return an InterestingMemoryAccess struct.
  /// Otherwise return std::nullopt.
  std::optional<InterestingMemoryAccess>
  isInterestingMemoryAccess(Instruction *I) const;

  void instrumentMop(Instruction *I, const DataLayout &DL,
                     InterestingMemoryAccess &Access);
  void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
                         Value *Addr, bool IsWrite);
  void instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
                                   Instruction *I, Value *Addr, Type *AccessTy,
                                   bool IsWrite);
  void instrumentMemIntrinsic(MemIntrinsic *MI);
  Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
  bool instrumentFunction(Function &F);
  bool maybeInsertMemProfInitAtFunctionEntry(Function &F);
  bool insertDynamicShadowAtFunctionEntry(Function &F);

private:
  void initializeCallbacks(Module &M);

  LLVMContext *C;
  int LongSize;
  Type *IntptrTy;
  PointerType *PtrTy;
  ShadowMapping Mapping;

  // This array is indexed by AccessIsWrite
  FunctionCallee MemProfMemoryAccessCallback[2];

  FunctionCallee MemProfMemmove, MemProfMemcpy, MemProfMemset;
  Value *DynamicShadowOffset = nullptr;
};

class ModuleMemProfiler {
public:
  ModuleMemProfiler(Module &M) { TargetTriple = Triple(M.getTargetTriple()); }

  bool instrumentModule(Module &);

private:
  Triple TargetTriple;
  ShadowMapping Mapping;
  Function *MemProfCtorFunction = nullptr;
};

} // end anonymous namespace

MemProfilerPass::MemProfilerPass() = default;

PreservedAnalyses MemProfilerPass::run(Function &F,
                                       AnalysisManager<Function> &AM) {
  assert((!ClHistogram || ClMappingGranularity == DefaultMemGranularity) &&
         "Memprof with histogram only supports default mapping granularity");
  Module &M = *F.getParent();
  MemProfiler Profiler(M);
  if (Profiler.instrumentFunction(F))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}

ModuleMemProfilerPass::ModuleMemProfilerPass() = default;

PreservedAnalyses ModuleMemProfilerPass::run(Module &M,
                                             AnalysisManager<Module> &AM) {

  ModuleMemProfiler Profiler(M);
  if (Profiler.instrumentModule(M))
    return PreservedAnalyses::none();
  return PreservedAnalyses::all();
}

Value *MemProfiler::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
  // (Shadow & mask) >> scale
  Shadow = IRB.CreateAnd(Shadow, Mapping.Mask);
  Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
  // (Shadow >> scale) + offset
  assert(DynamicShadowOffset);
  return IRB.CreateAdd(Shadow, DynamicShadowOffset);
}

// Instrument memset/memmove/memcpy
void MemProfiler::instrumentMemIntrinsic(MemIntrinsic *MI) {
  IRBuilder<> IRB(MI);
  if (isa<MemTransferInst>(MI)) {
    IRB.CreateCall(isa<MemMoveInst>(MI) ? MemProfMemmove : MemProfMemcpy,
                   {MI->getOperand(0), MI->getOperand(1),
                    IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
  } else if (isa<MemSetInst>(MI)) {
    IRB.CreateCall(
        MemProfMemset,
        {MI->getOperand(0),
         IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
         IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
  }
  MI->eraseFromParent();
}
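//
// For example (illustrative): a call such as
//   call void @llvm.memcpy.p0.p0.i64(ptr %dst, ptr %src, i64 %n, i1 false)
// becomes a call into the profiling runtime:
//   call ptr @__memprof_memcpy(ptr %dst, ptr %src, i64 %n)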

std::optional<InterestingMemoryAccess>
MemProfiler::isInterestingMemoryAccess(Instruction *I) const {
  // Do not instrument the load fetching the dynamic shadow address.
  if (DynamicShadowOffset == I)
    return std::nullopt;

  InterestingMemoryAccess Access;

  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
    if (!ClInstrumentReads)
      return std::nullopt;
    Access.IsWrite = false;
    Access.AccessTy = LI->getType();
    Access.Addr = LI->getPointerOperand();
  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
    if (!ClInstrumentWrites)
      return std::nullopt;
    Access.IsWrite = true;
    Access.AccessTy = SI->getValueOperand()->getType();
    Access.Addr = SI->getPointerOperand();
  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
    if (!ClInstrumentAtomics)
      return std::nullopt;
    Access.IsWrite = true;
    Access.AccessTy = RMW->getValOperand()->getType();
    Access.Addr = RMW->getPointerOperand();
  } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
    if (!ClInstrumentAtomics)
      return std::nullopt;
    Access.IsWrite = true;
    Access.AccessTy = XCHG->getCompareOperand()->getType();
    Access.Addr = XCHG->getPointerOperand();
  } else if (auto *CI = dyn_cast<CallInst>(I)) {
    auto *F = CI->getCalledFunction();
    if (F && (F->getIntrinsicID() == Intrinsic::masked_load ||
              F->getIntrinsicID() == Intrinsic::masked_store)) {
      unsigned OpOffset = 0;
      if (F->getIntrinsicID() == Intrinsic::masked_store) {
        if (!ClInstrumentWrites)
          return std::nullopt;
        // Masked store has an initial operand for the value.
        OpOffset = 1;
        Access.AccessTy = CI->getArgOperand(0)->getType();
        Access.IsWrite = true;
      } else {
        if (!ClInstrumentReads)
          return std::nullopt;
        Access.AccessTy = CI->getType();
        Access.IsWrite = false;
      }

      auto *BasePtr = CI->getOperand(0 + OpOffset);
      Access.MaybeMask = CI->getOperand(2 + OpOffset);
      Access.Addr = BasePtr;
    }
  }

  if (!Access.Addr)
    return std::nullopt;

  // Do not instrument accesses from different address spaces; we cannot deal
  // with them.
  Type *PtrTy = cast<PointerType>(Access.Addr->getType()->getScalarType());
  if (PtrTy->getPointerAddressSpace() != 0)
    return std::nullopt;

  // Ignore swifterror addresses.
  // swifterror memory addresses are mem2reg promoted by instruction
  // selection. As such they cannot have regular uses like an instrumentation
  // function and it makes no sense to track them as memory.
  if (Access.Addr->isSwiftError())
    return std::nullopt;

  // Peel off GEPs and BitCasts.
  auto *Addr = Access.Addr->stripInBoundsOffsets();

  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
    // Do not instrument PGO counter updates.
    if (GV->hasSection()) {
      StringRef SectionName = GV->getSection();
      // Check if the global is in the PGO counters section.
      auto OF = Triple(I->getModule()->getTargetTriple()).getObjectFormat();
      if (SectionName.ends_with(
              getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
        return std::nullopt;
    }

    // Do not instrument accesses to LLVM internal variables.
    if (GV->getName().starts_with("__llvm"))
      return std::nullopt;
  }

  return Access;
}
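//
// For example (illustrative): a plain "store i32 0, ptr %p" is interesting,
// while an access through a swifterror address, an access in a non-zero
// address space, or an update of an __llvm-prefixed global (e.g. PGO
// counters) is skipped.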

void MemProfiler::instrumentMaskedLoadOrStore(const DataLayout &DL, Value *Mask,
                                              Instruction *I, Value *Addr,
                                              Type *AccessTy, bool IsWrite) {
  auto *VTy = cast<FixedVectorType>(AccessTy);
  unsigned Num = VTy->getNumElements();
  auto *Zero = ConstantInt::get(IntptrTy, 0);
  for (unsigned Idx = 0; Idx < Num; ++Idx) {
    Value *InstrumentedAddress = nullptr;
    Instruction *InsertBefore = I;
    if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
      // dyn_cast as we might get UndefValue
      if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
        if (Masked->isZero())
          // Mask is constant false, so no instrumentation needed.
          continue;
        // If we have a true or undef value, fall through to instrumentAddress
        // with InsertBefore == I.
      }
    } else {
      IRBuilder<> IRB(I);
      Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
      Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
      InsertBefore = ThenTerm;
    }

    IRBuilder<> IRB(InsertBefore);
    InstrumentedAddress =
        IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
    instrumentAddress(I, InsertBefore, InstrumentedAddress, IsWrite);
  }
}

void MemProfiler::instrumentMop(Instruction *I, const DataLayout &DL,
                                InterestingMemoryAccess &Access) {
  // Skip instrumentation of stack accesses unless requested.
  if (!ClStack && isa<AllocaInst>(getUnderlyingObject(Access.Addr))) {
    if (Access.IsWrite)
      ++NumSkippedStackWrites;
    else
      ++NumSkippedStackReads;
    return;
  }

  if (Access.IsWrite)
    NumInstrumentedWrites++;
  else
    NumInstrumentedReads++;

  if (Access.MaybeMask) {
    instrumentMaskedLoadOrStore(DL, Access.MaybeMask, I, Access.Addr,
                                Access.AccessTy, Access.IsWrite);
  } else {
    // Since the access counts will be accumulated across the entire allocation,
    // we only update the shadow access count for the first location and thus
    // don't need to worry about alignment and type size.
    instrumentAddress(I, I, Access.Addr, Access.IsWrite);
  }
}

void MemProfiler::instrumentAddress(Instruction *OrigIns,
                                    Instruction *InsertBefore, Value *Addr,
                                    bool IsWrite) {
  IRBuilder<> IRB(InsertBefore);
  Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);

  if (ClUseCalls) {
    IRB.CreateCall(MemProfMemoryAccessCallback[IsWrite], AddrLong);
    return;
  }

  Type *ShadowTy = ClHistogram ? Type::getInt8Ty(*C) : Type::getInt64Ty(*C);
  Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);

  Value *ShadowPtr = memToShadow(AddrLong, IRB);
  Value *ShadowAddr = IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy);
  Value *ShadowValue = IRB.CreateLoad(ShadowTy, ShadowAddr);
  // If we are profiling with histograms, add overflow protection at 255.
  if (ClHistogram) {
    Value *MaxCount = ConstantInt::get(Type::getInt8Ty(*C), 255);
    Value *Cmp = IRB.CreateICmpULT(ShadowValue, MaxCount);
    Instruction *IncBlock =
        SplitBlockAndInsertIfThen(Cmp, InsertBefore, /*Unreachable=*/false);
    IRB.SetInsertPoint(IncBlock);
  }
  Value *Inc = ConstantInt::get(ShadowTy, 1);
  ShadowValue = IRB.CreateAdd(ShadowValue, Inc);
  IRB.CreateStore(ShadowValue, ShadowAddr);
}
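//
// In histogram mode the increment above is guarded (illustrative IR,
// hypothetical value names):
//
//   %old = load i8, ptr %shadow
//   %ok  = icmp ult i8 %old, 255
//   br i1 %ok, label %inc, label %cont
// inc:
//   %new = add i8 %old, 1
//   store i8 %new, ptr %shadow
//   br label %cont
//
// so each i8 bucket saturates at 255 instead of wrapping.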

// Create the variable for the profile file name.
void createProfileFileNameVar(Module &M) {
  const MDString *MemProfFilename =
      dyn_cast_or_null<MDString>(M.getModuleFlag("MemProfProfileFilename"));
  if (!MemProfFilename)
    return;
  assert(!MemProfFilename->getString().empty() &&
         "Unexpected MemProfProfileFilename metadata with empty string");
  Constant *ProfileNameConst = ConstantDataArray::getString(
      M.getContext(), MemProfFilename->getString(), true);
  GlobalVariable *ProfileNameVar = new GlobalVariable(
      M, ProfileNameConst->getType(), /*isConstant=*/true,
      GlobalValue::WeakAnyLinkage, ProfileNameConst, MemProfFilenameVar);
  Triple TT(M.getTargetTriple());
  if (TT.supportsCOMDAT()) {
    ProfileNameVar->setLinkage(GlobalValue::ExternalLinkage);
    ProfileNameVar->setComdat(M.getOrInsertComdat(MemProfFilenameVar));
  }
}
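//
// For reference (illustrative values, including the flag behavior and file
// name): the metadata consumed above is a module flag along the lines of
//   !llvm.module.flags = !{!0}
//   !0 = !{i32 1, !"MemProfProfileFilename", !"memprof.profraw"}
// which becomes a constant __memprof_profile_filename string the runtime can
// read.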

// Set MemprofHistogramFlag as a global variable in IR. This makes it accessible
// to the runtime, changing shadow count behavior.
void createMemprofHistogramFlagVar(Module &M) {
  const StringRef VarName(MemProfHistogramFlagVar);
  Type *IntTy1 = Type::getInt1Ty(M.getContext());
  auto MemprofHistogramFlag = new GlobalVariable(
      M, IntTy1, true, GlobalValue::WeakAnyLinkage,
      Constant::getIntegerValue(IntTy1, APInt(1, ClHistogram)), VarName);
  Triple TT(M.getTargetTriple());
  if (TT.supportsCOMDAT()) {
    MemprofHistogramFlag->setLinkage(GlobalValue::ExternalLinkage);
    MemprofHistogramFlag->setComdat(M.getOrInsertComdat(VarName));
  }
  appendToCompilerUsed(M, MemprofHistogramFlag);
}

void createMemprofDefaultOptionsVar(Module &M) {
  Constant *OptionsConst = ConstantDataArray::getString(
      M.getContext(), MemprofRuntimeDefaultOptions, /*AddNull=*/true);
  GlobalVariable *OptionsVar =
      new GlobalVariable(M, OptionsConst->getType(), /*isConstant=*/true,
                         GlobalValue::WeakAnyLinkage, OptionsConst,
                         "__memprof_default_options_str");
  Triple TT(M.getTargetTriple());
  if (TT.supportsCOMDAT()) {
    OptionsVar->setLinkage(GlobalValue::ExternalLinkage);
    OptionsVar->setComdat(M.getOrInsertComdat(OptionsVar->getName()));
  }
}

bool ModuleMemProfiler::instrumentModule(Module &M) {

  // Create a module constructor.
  std::string MemProfVersion = std::to_string(LLVM_MEM_PROFILER_VERSION);
  std::string VersionCheckName =
      ClInsertVersionCheck ? (MemProfVersionCheckNamePrefix + MemProfVersion)
                           : "";
  std::tie(MemProfCtorFunction, std::ignore) =
      createSanitizerCtorAndInitFunctions(M, MemProfModuleCtorName,
                                          MemProfInitName, /*InitArgTypes=*/{},
                                          /*InitArgs=*/{}, VersionCheckName);

  const uint64_t Priority = getCtorAndDtorPriority(TargetTriple);
  appendToGlobalCtors(M, MemProfCtorFunction, Priority);

  createProfileFileNameVar(M);

  createMemprofHistogramFlagVar(M);

  createMemprofDefaultOptionsVar(M);

  return true;
}

void MemProfiler::initializeCallbacks(Module &M) {
  IRBuilder<> IRB(*C);

  for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
    const std::string TypeStr = AccessIsWrite ? "store" : "load";
    const std::string HistPrefix = ClHistogram ? "hist_" : "";

    SmallVector<Type *, 2> Args1{1, IntptrTy};
    MemProfMemoryAccessCallback[AccessIsWrite] = M.getOrInsertFunction(
        ClMemoryAccessCallbackPrefix + HistPrefix + TypeStr,
        FunctionType::get(IRB.getVoidTy(), Args1, false));
  }
  MemProfMemmove = M.getOrInsertFunction(
      ClMemoryAccessCallbackPrefix + "memmove", PtrTy, PtrTy, PtrTy, IntptrTy);
  MemProfMemcpy = M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "memcpy",
                                        PtrTy, PtrTy, PtrTy, IntptrTy);
  MemProfMemset =
      M.getOrInsertFunction(ClMemoryAccessCallbackPrefix + "memset", PtrTy,
                            PtrTy, IRB.getInt32Ty(), IntptrTy);
}
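//
// With the default "__memprof_" prefix this declares __memprof_load and
// __memprof_store for accesses (__memprof_hist_load and __memprof_hist_store
// in histogram mode) plus __memprof_memmove, __memprof_memcpy and
// __memprof_memset.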

bool MemProfiler::maybeInsertMemProfInitAtFunctionEntry(Function &F) {
  // For each NSObject descendant having a +load method, this method is invoked
  // by the ObjC runtime before any of the static constructors is called.
  // Therefore we need to instrument such methods with a call to __memprof_init
  // at the beginning in order to initialize our runtime before any access to
  // the shadow memory.
  // We cannot just ignore these methods, because they may call other
  // instrumented functions.
  if (F.getName().contains(" load]")) {
    FunctionCallee MemProfInitFunction =
        declareSanitizerInitFunction(*F.getParent(), MemProfInitName, {});
    IRBuilder<> IRB(&F.front(), F.front().begin());
    IRB.CreateCall(MemProfInitFunction, {});
    return true;
  }
  return false;
}

bool MemProfiler::insertDynamicShadowAtFunctionEntry(Function &F) {
  IRBuilder<> IRB(&F.front().front());
  Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
      MemProfShadowMemoryDynamicAddress, IntptrTy);
  if (F.getParent()->getPICLevel() == PICLevel::NotPIC)
    cast<GlobalVariable>(GlobalDynamicAddress)->setDSOLocal(true);
  DynamicShadowOffset = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
  return true;
}

bool MemProfiler::instrumentFunction(Function &F) {
  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
    return false;
  if (ClDebugFunc == F.getName())
    return false;
  if (F.getName().starts_with("__memprof_"))
    return false;

  bool FunctionModified = false;

  // If needed, insert __memprof_init.
  // This function needs to be called even if the function body is not
  // instrumented.
  if (maybeInsertMemProfInitAtFunctionEntry(F))
    FunctionModified = true;

  LLVM_DEBUG(dbgs() << "MEMPROF instrumenting:\n" << F << "\n");

  initializeCallbacks(*F.getParent());

  SmallVector<Instruction *, 16> ToInstrument;

  // Fill the set of memory operations to instrument.
  for (auto &BB : F) {
    for (auto &Inst : BB) {
      if (isInterestingMemoryAccess(&Inst) || isa<MemIntrinsic>(Inst))
        ToInstrument.push_back(&Inst);
    }
  }

  if (ToInstrument.empty()) {
    LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified
                      << " " << F << "\n");

    return FunctionModified;
  }

  FunctionModified |= insertDynamicShadowAtFunctionEntry(F);

  int NumInstrumented = 0;
  for (auto *Inst : ToInstrument) {
    if (ClDebugMin < 0 || ClDebugMax < 0 ||
        (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
      std::optional<InterestingMemoryAccess> Access =
          isInterestingMemoryAccess(Inst);
      if (Access)
        instrumentMop(Inst, F.getDataLayout(), *Access);
      else
        instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
    }
    NumInstrumented++;
  }

  if (NumInstrumented > 0)
    FunctionModified = true;

  LLVM_DEBUG(dbgs() << "MEMPROF done instrumenting: " << FunctionModified << " "
                    << F << "\n");

  return FunctionModified;
}

static void addCallsiteMetadata(Instruction &I,
                                ArrayRef<uint64_t> InlinedCallStack,
                                LLVMContext &Ctx) {
  I.setMetadata(LLVMContext::MD_callsite,
                buildCallstackMetadata(InlinedCallStack, Ctx));
}

static uint64_t computeStackId(GlobalValue::GUID Function, uint32_t LineOffset,
                               uint32_t Column) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  HashBuilder.add(Function, LineOffset, Column);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  uint64_t Id;
  std::memcpy(&Id, Hash.data(), sizeof(Hash));
  return Id;
}

static uint64_t computeStackId(const memprof::Frame &Frame) {
  return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
}

// Helper to generate a single hash id for a given callstack, used for emitting
// matching statistics and useful for uniquing such statistics across modules.
static uint64_t computeFullStackId(ArrayRef<Frame> CallStack) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (auto &F : CallStack)
    HashBuilder.add(F.Function, F.LineOffset, F.Column);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  uint64_t Id;
  std::memcpy(&Id, Hash.data(), sizeof(Hash));
  return Id;
}

static AllocationType addCallStack(CallStackTrie &AllocTrie,
                                   const AllocationInfo *AllocInfo,
                                   uint64_t FullStackId) {
  SmallVector<uint64_t> StackIds;
  for (const auto &StackFrame : AllocInfo->CallStack)
    StackIds.push_back(computeStackId(StackFrame));
  auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
                                AllocInfo->Info.getAllocCount(),
                                AllocInfo->Info.getTotalLifetime());
  std::vector<ContextTotalSize> ContextSizeInfo;
  if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
    auto TotalSize = AllocInfo->Info.getTotalSize();
    assert(TotalSize);
    assert(FullStackId != 0);
    ContextSizeInfo.push_back({FullStackId, TotalSize});
  }
  AllocTrie.addCallStack(AllocType, StackIds, std::move(ContextSizeInfo));
  return AllocType;
}

// Helper to compare the InlinedCallStack computed from an instruction's debug
// info to a list of Frames from profile data (either the allocation data or a
// callsite). For callsites, the profile data may be a slice of the Frame
// array starting at a non-zero offset.
static bool
stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack,
                                   ArrayRef<uint64_t> InlinedCallStack) {
  auto StackFrame = ProfileCallStack.begin();
  auto InlCallStackIter = InlinedCallStack.begin();
  for (; StackFrame != ProfileCallStack.end() &&
         InlCallStackIter != InlinedCallStack.end();
       ++StackFrame, ++InlCallStackIter) {
    uint64_t StackId = computeStackId(*StackFrame);
    if (StackId != *InlCallStackIter)
      return false;
  }
  // Return true if we found and matched all stack ids from the call
  // instruction.
  return InlCallStackIter == InlinedCallStack.end();
}
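//
// Example (illustrative): for a profile stack [A, B, C, D] (leaf first) and an
// inlined IR stack [A, B], the walk matches A and B and returns true; an IR
// stack [A, C] fails on the second frame and returns false.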

static bool isAllocationWithHotColdVariant(const Function *Callee,
                                           const TargetLibraryInfo &TLI) {
  if (!Callee)
    return false;
  LibFunc Func;
  if (!TLI.getLibFunc(*Callee, Func))
    return false;
  switch (Func) {
  case LibFunc_Znwm:
  case LibFunc_ZnwmRKSt9nothrow_t:
  case LibFunc_ZnwmSt11align_val_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
  case LibFunc_Znam:
  case LibFunc_ZnamRKSt9nothrow_t:
  case LibFunc_ZnamSt11align_val_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
  case LibFunc_size_returning_new:
  case LibFunc_size_returning_new_aligned:
    return true;
  case LibFunc_Znwm12__hot_cold_t:
  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_Znam12__hot_cold_t:
  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_size_returning_new_hot_cold:
  case LibFunc_size_returning_new_aligned_hot_cold:
    return ClMemProfMatchHotColdNew;
  default:
    return false;
  }
}

struct AllocMatchInfo {
  uint64_t TotalSize = 0;
  AllocationType AllocType = AllocationType::None;
  bool Matched = false;
};

DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI) {
  DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls;

  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
          continue;

        auto *CB = dyn_cast<CallBase>(&I);
        auto *CalledFunction = CB->getCalledFunction();
        // Disregard indirect calls and intrinsics.
        if (!CalledFunction || CalledFunction->isIntrinsic())
          continue;

        StringRef CalleeName = CalledFunction->getName();
        bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI);
        for (const DILocation *DIL = I.getDebugLoc(); DIL;
             DIL = DIL->getInlinedAt()) {
          StringRef CallerName = DIL->getSubprogramLinkageName();
          assert(!CallerName.empty() &&
                 "Be sure to enable -fdebug-info-for-profiling");
          uint64_t CallerGUID = IndexedMemProfRecord::getGUID(CallerName);
          uint64_t CalleeGUID = IndexedMemProfRecord::getGUID(CalleeName);
          // Pretend that we are calling a function with GUID == 0 if we are
          // calling a heap allocation function.
          if (IsAlloc)
            CalleeGUID = 0;
          LineLocation Loc = {GetOffset(DIL), DIL->getColumn()};
          Calls[CallerGUID].emplace_back(Loc, CalleeGUID);
          CalleeName = CallerName;
          // FIXME: Recognize other frames that are associated with heap
          // allocation functions.  It may be too early to reset IsAlloc to
          // false here.
          IsAlloc = false;
        }
      }
    }
  }

  // Sort each call list by the source location.
  for (auto &[CallerGUID, CallList] : Calls) {
    llvm::sort(CallList);
    CallList.erase(llvm::unique(CallList), CallList.end());
  }

  return Calls;
}
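//
// Example (illustrative, hypothetical names): a call foo -> bar at line 110,
// column 7, where foo's definition starts at line 100, is recorded as
//   Calls[GUID(foo)] += {LineLocation(10, 7), GUID(bar)}
// and a call to a recognized allocation function is recorded with
// CalleeGUID == 0.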

DenseMap<uint64_t, LocToLocMap>
memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
                           const TargetLibraryInfo &TLI) {
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;

  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile =
      MemProfReader->getMemProfCallerCalleePairs();
  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR =
      extractCallsFromIR(M, TLI);

  // Compute an undrift map for each CallerGUID.
  for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) {
    auto It = CallsFromProfile.find(CallerGUID);
    if (It == CallsFromProfile.end())
      continue;
    const auto &ProfileAnchors = It->second;

    LocToLocMap Matchings;
    longestCommonSequence<LineLocation, GlobalValue::GUID>(
        ProfileAnchors, IRAnchors, std::equal_to<GlobalValue::GUID>(),
        [&](LineLocation A, LineLocation B) { Matchings.try_emplace(A, B); });
    bool Inserted = UndriftMaps.try_emplace(CallerGUID, Matchings).second;

    // The insertion must succeed because we visit each GUID exactly once.
    assert(Inserted);
    (void)Inserted;
  }

  return UndriftMaps;
}
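//
// Example (illustrative): if the profile recorded an anchor at (10, 7) calling
// GUID(bar) but the IR now places the matching anchor at (12, 7), the longest
// common sequence pairs them and Matchings maps (10, 7) -> (12, 7);
// undriftMemProfRecord below applies that mapping to every profiled frame.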

// Given a MemProfRecord, undrift all the source locations present in the
// record in place.
static void
undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
                     memprof::MemProfRecord &MemProfRec) {
  // Undrift a call stack in place.
  auto UndriftCallStack = [&](std::vector<Frame> &CallStack) {
    for (auto &F : CallStack) {
      auto I = UndriftMaps.find(F.Function);
      if (I == UndriftMaps.end())
        continue;
      auto J = I->second.find(LineLocation(F.LineOffset, F.Column));
      if (J == I->second.end())
        continue;
      auto &NewLoc = J->second;
      F.LineOffset = NewLoc.LineOffset;
      F.Column = NewLoc.Column;
    }
  };

  for (auto &AS : MemProfRec.AllocSites)
    UndriftCallStack(AS.CallStack);

  for (auto &CS : MemProfRec.CallSites)
    UndriftCallStack(CS);
}

static void
readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
            const TargetLibraryInfo &TLI,
            std::map<uint64_t, AllocMatchInfo> &FullStackIdToAllocMatchInfo,
            DenseMap<uint64_t, LocToLocMap> &UndriftMaps) {
  auto &Ctx = M.getContext();
  // Previously we used getIRPGOFuncName() here. If F has local linkage,
  // getIRPGOFuncName() returns FuncName with the prefix 'FileName;'. But
  // llvm-profdata uses the FuncName from DWARF to create the GUID, which
  // doesn't contain FileName's prefix, so local-linkage functions could not
  // find their MemProfRecord. We therefore use getName() now.
  // 'unique-internal-linkage-names' can make MemProf work better for local
  // linkage functions.
  auto FuncName = F.getName();
  auto FuncGUID = Function::getGUID(FuncName);
  std::optional<memprof::MemProfRecord> MemProfRec;
  auto Err = MemProfReader->getMemProfRecord(FuncGUID).moveInto(MemProfRec);
  if (Err) {
    handleAllErrors(std::move(Err), [&](const InstrProfError &IPE) {
      auto Err = IPE.get();
      bool SkipWarning = false;
      LLVM_DEBUG(dbgs() << "Error in reading profile for Func " << FuncName
                        << ": ");
      if (Err == instrprof_error::unknown_function) {
        NumOfMemProfMissing++;
        SkipWarning = !PGOWarnMissing;
        LLVM_DEBUG(dbgs() << "unknown function");
      } else if (Err == instrprof_error::hash_mismatch) {
        NumOfMemProfMismatch++;
        SkipWarning =
            NoPGOWarnMismatch ||
            (NoPGOWarnMismatchComdatWeak &&
             (F.hasComdat() ||
              F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
        LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
      }

      if (SkipWarning)
        return;

      std::string Msg = (IPE.message() + Twine(" ") + F.getName().str() +
                         Twine(" Hash = ") + std::to_string(FuncGUID))
                            .str();

      Ctx.diagnose(
          DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
    });
    return;
  }

  NumOfMemProfFunc++;

  // If requested, undrift MemProfRecord so that the source locations in it
  // match those in the IR.
  if (SalvageStaleProfile)
    undriftMemProfRecord(UndriftMaps, *MemProfRec);

  // Detect if there are non-zero column numbers in the profile. If not,
  // treat all column numbers as 0 when matching (i.e. ignore any non-zero
  // columns in the IR). The profiled binary might have been built with
  // column numbers disabled, for example.
  bool ProfileHasColumns = false;

  // Build maps of the location hash to all profile data with that leaf location
  // (allocation info and the callsites).
  std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;
  // A hash function for std::unordered_set<ArrayRef<Frame>> to work.
  struct CallStackHash {
    size_t operator()(ArrayRef<Frame> CS) const {
      return computeFullStackId(CS);
    }
  };
  // For the callsites we need to record slices of the frame array (see comments
  // below where the map entries are added).
  std::map<uint64_t, std::unordered_set<ArrayRef<Frame>, CallStackHash>>
      LocHashToCallSites;
  for (auto &AI : MemProfRec->AllocSites) {
    NumOfMemProfAllocContextProfiles++;
    // Associate the allocation info with the leaf frame. The later matching
    // code will match any inlined call sequences in the IR with a longer prefix
    // of call stack frames.
    uint64_t StackId = computeStackId(AI.CallStack[0]);
    LocHashToAllocInfo[StackId].insert(&AI);
    ProfileHasColumns |= AI.CallStack[0].Column;
  }
  for (auto &CS : MemProfRec->CallSites) {
    NumOfMemProfCallSiteProfiles++;
    // Need to record all frames from leaf up to and including this function,
    // as any of these may or may not have been inlined at this point.
    unsigned Idx = 0;
    for (auto &StackFrame : CS) {
      uint64_t StackId = computeStackId(StackFrame);
      LocHashToCallSites[StackId].insert(ArrayRef<Frame>(CS).drop_front(Idx++));
      ProfileHasColumns |= StackFrame.Column;
      // Once we find this function, we can stop recording.
      if (StackFrame.Function == FuncGUID)
        break;
    }
    assert(Idx <= CS.size() && CS[Idx - 1].Function == FuncGUID);
  }

  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  // Now walk the instructions, looking up the associated profile data using
  // debug locations.
  for (auto &BB : F) {
    for (auto &I : BB) {
      if (I.isDebugOrPseudoInst())
        continue;
      // We are only interested in calls (allocation or interior call stack
      // context calls).
      auto *CI = dyn_cast<CallBase>(&I);
      if (!CI)
        continue;
      auto *CalledFunction = CI->getCalledFunction();
      if (CalledFunction && CalledFunction->isIntrinsic())
        continue;
      // List of call stack ids computed from the location hashes on debug
      // locations (leaf to inlined at root).
      SmallVector<uint64_t, 8> InlinedCallStack;
      // Was the leaf location found in one of the profile maps?
      bool LeafFound = false;
      // If leaf was found in a map, iterators pointing to its location in both
      // of the maps. It might exist in neither, one, or both (the latter case
      // can happen because we don't currently have discriminators to
      // distinguish the case when a single line/col maps to both an allocation
      // and another callsite).
      std::map<uint64_t, std::set<const AllocationInfo *>>::iterator
          AllocInfoIter;
      decltype(LocHashToCallSites)::iterator CallSitesIter;
      for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr;
           DIL = DIL->getInlinedAt()) {
        // Use C++ linkage name if possible. Need to compile with
        // -fdebug-info-for-profiling to get linkage name.
        StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
        if (Name.empty())
          Name = DIL->getScope()->getSubprogram()->getName();
        auto CalleeGUID = Function::getGUID(Name);
        auto StackId = computeStackId(CalleeGUID, GetOffset(DIL),
                                      ProfileHasColumns ? DIL->getColumn() : 0);
        // Check if we have found the profile's leaf frame. If yes, collect
        // the rest of the call's inlined context starting here. If not, see if
        // we find a match further up the inlined context (in case the profile
        // was missing debug frames at the leaf).
        if (!LeafFound) {
          AllocInfoIter = LocHashToAllocInfo.find(StackId);
          CallSitesIter = LocHashToCallSites.find(StackId);
          if (AllocInfoIter != LocHashToAllocInfo.end() ||
              CallSitesIter != LocHashToCallSites.end())
            LeafFound = true;
        }
        if (LeafFound)
          InlinedCallStack.push_back(StackId);
      }
      // If leaf not in either of the maps, skip inst.
      if (!LeafFound)
        continue;

      // First add !memprof metadata from allocation info, if we found the
      // instruction's leaf location in that map, and if the rest of the
      // instruction's locations match the prefix Frame locations on an
      // allocation context with the same leaf.
      if (AllocInfoIter != LocHashToAllocInfo.end()) {
        // Only consider allocations which support hinting.
        if (!isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI))
          continue;
        // We may match this instruction's location list to multiple MIB
        // contexts. Add them to a Trie specialized for trimming the contexts to
        // the minimal needed to disambiguate contexts with unique behavior.
        CallStackTrie AllocTrie;
        uint64_t TotalSize = 0;
        uint64_t TotalColdSize = 0;
        for (auto *AllocInfo : AllocInfoIter->second) {
          // Check the full inlined call stack against this one.
          // If we found and thus matched all frames on the call, include
          // this MIB.
          if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                                 InlinedCallStack)) {
            NumOfMemProfMatchedAllocContexts++;
            uint64_t FullStackId = 0;
            if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
                MinClonedColdBytePercent < 100)
              FullStackId = computeFullStackId(AllocInfo->CallStack);
            auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
            TotalSize += AllocInfo->Info.getTotalSize();
            if (AllocType == AllocationType::Cold)
              TotalColdSize += AllocInfo->Info.getTotalSize();
            // Record information about the allocation if match info printing
            // was requested.
            if (ClPrintMemProfMatchInfo) {
              assert(FullStackId != 0);
              FullStackIdToAllocMatchInfo[FullStackId] = {
                  AllocInfo->Info.getTotalSize(), AllocType, /*Matched=*/true};
            }
          }
        }
        // If the threshold for the percent of cold bytes is less than 100%,
        // and not all bytes are cold, see if we should still hint this
        // allocation as cold without context sensitivity.
        if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
            TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
          AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold,
                                                "dominant");
          continue;
        }

        // We might not have matched any to the full inlined call stack.
        // But if we did, create and attach metadata, or a function attribute if
        // all contexts have identical profiled behavior.
        if (!AllocTrie.empty()) {
          NumOfMemProfMatchedAllocs++;
          // MemprofMDAttached will be false if a function attribute was
          // attached.
          bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
          assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
          if (MemprofMDAttached) {
            // Add callsite metadata for the instruction's location list so that
            // it is simpler later on to identify which part of the MIB contexts
            // are from this particular instruction (including during inlining,
            // when the callsite metadata will be updated appropriately).
            // FIXME: can this be changed to strip out the matching stack
            // context ids from the MIB contexts and not add any callsite
            // metadata here to save space?
            addCallsiteMetadata(I, InlinedCallStack, Ctx);
          }
        }
        continue;
      }

      // Otherwise, add callsite metadata. If we reach here then we found the
      // instruction's leaf location in the callsites map and not the allocation
      // map.
      assert(CallSitesIter != LocHashToCallSites.end());
      for (auto CallStackIdx : CallSitesIter->second) {
        // If we found and thus matched all frames on the call, create and
        // attach call stack metadata.
        if (stackFrameIncludesInlinedCallStack(CallStackIdx,
                                               InlinedCallStack)) {
          NumOfMemProfMatchedCallSites++;
          addCallsiteMetadata(I, InlinedCallStack, Ctx);
          // Only need to find one with a matching call stack and add a single
          // callsite metadata.
          break;
        }
      }
    }
  }
}

MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile,
                               IntrusiveRefCntPtr<vfs::FileSystem> FS)
    : MemoryProfileFileName(MemoryProfileFile), FS(FS) {
  if (!FS)
    this->FS = vfs::getRealFileSystem();
}

PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
  // Return immediately if the module doesn't contain any function.
  if (M.empty())
    return PreservedAnalyses::all();

  LLVM_DEBUG(dbgs() << "Read in memory profile:");
  auto &Ctx = M.getContext();
  auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS);
  if (Error E = ReaderOrErr.takeError()) {
    handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
      Ctx.diagnose(
          DiagnosticInfoPGOProfile(MemoryProfileFileName.data(), EI.message()));
    });
    return PreservedAnalyses::all();
  }

  std::unique_ptr<IndexedInstrProfReader> MemProfReader =
      std::move(ReaderOrErr.get());
  if (!MemProfReader) {
    Ctx.diagnose(DiagnosticInfoPGOProfile(
        MemoryProfileFileName.data(), StringRef("Cannot get MemProfReader")));
    return PreservedAnalyses::all();
  }

  if (!MemProfReader->hasMemoryProfile()) {
    Ctx.diagnose(DiagnosticInfoPGOProfile(MemoryProfileFileName.data(),
                                          "Not a memory profile"));
    return PreservedAnalyses::all();
  }

  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;
  if (SalvageStaleProfile)
    UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI);

  // Map from the stack hash of each allocation context in the function profiles
  // to the total profiled size (bytes), allocation type, and whether we matched
  // it to an allocation in the IR.
  std::map<uint64_t, AllocMatchInfo> FullStackIdToAllocMatchInfo;

  for (auto &F : M) {
    if (F.isDeclaration())
      continue;

    const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
    readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
                UndriftMaps);
  }

  if (ClPrintMemProfMatchInfo) {
    for (const auto &[Id, Info] : FullStackIdToAllocMatchInfo)
      errs() << "MemProf " << getAllocTypeAttributeString(Info.AllocType)
             << " context with id " << Id << " has total profiled size "
             << Info.TotalSize << (Info.Matched ? " is" : " not")
             << " matched\n";
  }

  return PreservedAnalyses::none();
}