xref: /llvm-project/llvm/lib/ProfileData/MemProf.cpp (revision 6fb967ec9e13216ee1b4fc15764e0b3df9e5683f)
#include "llvm/ProfileData/MemProf.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/SampleProf.h"
#include "llvm/Support/BLAKE3.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/EndianStream.h"
#include "llvm/Support/HashBuilder.h"

namespace llvm {
namespace memprof {
MemProfSchema getFullSchema() {
  MemProfSchema List;
#define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name);
#include "llvm/ProfileData/MIBEntryDef.inc"
#undef MIBEntryDef
  return List;
}

MemProfSchema getHotColdSchema() {
  return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime,
          Meta::TotalLifetimeAccessDensity};
}

static size_t serializedSizeV2(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The CallStackId
  Size += sizeof(CallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

static size_t serializedSizeV3(const IndexedAllocationInfo &IAI,
                               const MemProfSchema &Schema) {
  size_t Size = 0;
  // The linear call stack ID.
  Size += sizeof(LinearCallStackId);
  // The size of the payload.
  Size += PortableMemInfoBlock::serializedSize(Schema);
  return Size;
}

size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema,
                                             IndexedVersion Version) const {
  switch (Version) {
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

static size_t serializedSizeV2(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version2);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The CallStackId
  Result += Record.CallSiteIds.size() * sizeof(CallStackId);
  return Result;
}

static size_t serializedSizeV3(const IndexedMemProfRecord &Record,
                               const MemProfSchema &Schema) {
  // The number of alloc sites to serialize.
  size_t Result = sizeof(uint64_t);
  for (const IndexedAllocationInfo &N : Record.AllocSites)
    Result += N.serializedSize(Schema, Version3);

  // The number of callsites we have information for.
  Result += sizeof(uint64_t);
  // The linear call stack ID.
  Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId);
  return Result;
}

size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema,
                                            IndexedVersion Version) const {
  switch (Version) {
  case Version2:
    return serializedSizeV2(*this, Schema);
  case Version3:
    return serializedSizeV3(*this, Schema);
  }
  llvm_unreachable("unsupported MemProf version");
}

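// The V2 on-disk record layout produced by serializeV2 below is, in
// little-endian order:
//
//   uint64_t NumAllocSites
//   NumAllocSites x { CallStackId CSId; MIB payload per Schema }
//   uint64_t NumCallSiteIds
//   NumCallSiteIds x CallStackId
//
// serializeV3 differs only in that call stacks are referenced by
// LinearCallStackId (a position in the radix array) instead of CallStackId.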
static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}

static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId.  Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}

MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.". Note
  // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop
  // those by default. This is by design to differentiate internal linkage
  // functions during matching. By dropping the other suffixes we can then match
  // functions in the profile use phase prior to their addition. Note that this
  // applies to both instrumented and sampled function names.
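  // For example (illustrative only): a name such as "_Z3foov.llvm.123456"
  // canonicalizes to "_Z3foov", whereas a ".__uniq." suffix is preserved.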
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function GUID, which we expect to be a uint64_t. At this time,
  // it is the lower 64 bits of the MD5 of the canonical function name.
  return Function::getGUID(CanonicalName);
}

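// The schema layout read below: a little-endian uint64_t count followed by
// that many little-endian uint64_t tags, each identifying a Meta enumerator.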
Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}

CallStackId IndexedMemProfData::hashCallStack(ArrayRef<FrameId> CS) const {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray.  Return the starting index within
// RadixArray.  For each call stack we encode, we emit two or three components
// into RadixArray.  If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number.
//
// - a portion of the given call stack that's beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix.
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf-to-root order.  build, the caller of this
// function, reverses RadixArray in place so that we can reconstruct a call
// stack as if we were deserializing an array in a typical way -- the call stack
// length followed by the frames in the leaf-to-root order except that we need
// to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
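//
// A hypothetical example: given two call stacks stored leaf-to-root,
// A = [F3, F2, F1] and B = [F4, F2, F1], encoded in the order B then A, the
// array built below is first:
//
//   [F1, F2, F4, 3, -3, F3, 3]
//
// B starts at index 3 (its length) and A starts at index 6; A's entry -3
// points back at F2, the deepest frame it shares with B.  After build reverses
// the array, the layout becomes [3, F3, -3, 3, F4, F2, F1], so reading A from
// its re-mapped position yields its length, then F3, then a jump to the shared
// F2 -> F1 suffix.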
template <typename FrameIdTy>
LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
    const llvm::SmallVector<FrameIdTy> *CallStack,
    const llvm::SmallVector<FrameIdTy> *Prev,
    const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(
        MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

template <typename FrameIdTy>
void CallStackRadixTreeBuilder<FrameIdTy>::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
    llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData.  The vector is exactly
  // what we need to sort.  Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray.  However, we go one step
  // further and try to reduce the number of times we follow pointers to
  // parents during deserialization.  Consider a poorly encoded radix tree:
  //
  // CallStackId 1:  f1 -> f2 -> f3
  //                  |
  // CallStackId 2:   +--- f4 -> f5
  //                        |
  // CallStackId 3:         +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1.  Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds.  In the example above, f4 is more popular than f2, so we sort the
  // call stacks and encode them as:
  //
  // CallStackId 2:  f1 -- f4 -> f5
  //                  |     |
  // CallStackId 3:   |     +--> f6
  //                  |
  // CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps.  The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node, and encode a radix tree from
  // the heaviest node first.  We do not do so because that's a lot of work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root.  Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameIdTy F1, FrameIdTy F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks from
          // the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow to at most the length of the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array.  We encode one call stack at a time, computing the
  // longest prefix that's shared with the previous call stack we encode.  For
  // each call stack we encode, we remember a mapping from CallStackId to its
  // position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents.  Consider the
  // list of call stacks that has been sorted in the dictionary order:
  //
  // Call Stack 1: F1
  // Call Stack 2: F1 -> F2
  // Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  // Call Stack 1:  F1
  //                |
  // Call Stack 2:  +---> F2
  //                      |
  // Call Stack 3:        +---> F3
  //
  // Notice that each call stack jumps to the previous one.  However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers.  Call Stacks 1 and 2
  // point to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place.  We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}

// Explicitly instantiate the class for each FrameIdTy in use.
template class CallStackRadixTreeBuilder<FrameId>;
template class CallStackRadixTreeBuilder<LinearFrameId>;
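
// A rough usage sketch (assuming the getRadixArray/takeCallStackPos accessors
// declared in MemProf.h; the on-disk writer follows this general shape):
//
//   auto Histogram = computeFrameHistogram<LinearFrameId>(CallStackData);
//   CallStackRadixTreeBuilder<LinearFrameId> Builder;
//   Builder.build(std::move(CallStackData), &FrameIndexes, Histogram);
//   // Emit Builder.getRadixArray() and use Builder.takeCallStackPos() to map
//   // each CallStackId to its LinearCallStackId for serializeV3 above.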

template <typename FrameIdTy>
llvm::DenseMap<FrameIdTy, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameIdTy, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}

// Explicitly instantiate the function for each FrameIdTy in use.
template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &MemProfCallStackData);
template llvm::DenseMap<LinearFrameId, FrameStat>
computeFrameHistogram<LinearFrameId>(
    llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
        &MemProfCallStackData);
} // namespace memprof
} // namespace llvm