10a418490SSnehasish Kumar #include "llvm/ProfileData/MemProf.h" 227a4f254SSnehasish Kumar #include "llvm/ADT/SmallVector.h" 327a4f254SSnehasish Kumar #include "llvm/IR/Function.h" 4fc97efa4Sserge-sans-paille #include "llvm/ProfileData/InstrProf.h" 50edc32fdSSnehasish Kumar #include "llvm/ProfileData/SampleProf.h" 674799f42SKazu Hirata #include "llvm/Support/BLAKE3.h" 70a418490SSnehasish Kumar #include "llvm/Support/Endian.h" 80a418490SSnehasish Kumar #include "llvm/Support/EndianStream.h" 974799f42SKazu Hirata #include "llvm/Support/HashBuilder.h" 100a418490SSnehasish Kumar 110a418490SSnehasish Kumar namespace llvm { 120a418490SSnehasish Kumar namespace memprof { 13cb9589b2SKazu Hirata MemProfSchema getFullSchema() { 14cb9589b2SKazu Hirata MemProfSchema List; 15cb9589b2SKazu Hirata #define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name); 16cb9589b2SKazu Hirata #include "llvm/ProfileData/MIBEntryDef.inc" 17cb9589b2SKazu Hirata #undef MIBEntryDef 18cb9589b2SKazu Hirata return List; 19cb9589b2SKazu Hirata } 20cb9589b2SKazu Hirata 21cb9589b2SKazu Hirata MemProfSchema getHotColdSchema() { 22cb9589b2SKazu Hirata return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime, 23cb9589b2SKazu Hirata Meta::TotalLifetimeAccessDensity}; 24cb9589b2SKazu Hirata } 25cb9589b2SKazu Hirata 26edf733bcSKazu Hirata static size_t serializedSizeV2(const IndexedAllocationInfo &IAI, 27edf733bcSKazu Hirata const MemProfSchema &Schema) { 28d89914f3SKazu Hirata size_t Size = 0; 29d89914f3SKazu Hirata // The CallStackId 30d89914f3SKazu Hirata Size += sizeof(CallStackId); 31d89914f3SKazu Hirata // The size of the payload. 32edf733bcSKazu Hirata Size += PortableMemInfoBlock::serializedSize(Schema); 33d89914f3SKazu Hirata return Size; 34d89914f3SKazu Hirata } 35d89914f3SKazu Hirata 3637f30234SKazu Hirata static size_t serializedSizeV3(const IndexedAllocationInfo &IAI, 3737f30234SKazu Hirata const MemProfSchema &Schema) { 3837f30234SKazu Hirata size_t Size = 0; 3937f30234SKazu Hirata // The linear call stack ID. 4037f30234SKazu Hirata Size += sizeof(LinearCallStackId); 4137f30234SKazu Hirata // The size of the payload. 4237f30234SKazu Hirata Size += PortableMemInfoBlock::serializedSize(Schema); 4337f30234SKazu Hirata return Size; 4437f30234SKazu Hirata } 4537f30234SKazu Hirata 46edf733bcSKazu Hirata size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema, 47edf733bcSKazu Hirata IndexedVersion Version) const { 48d89914f3SKazu Hirata switch (Version) { 49d89914f3SKazu Hirata case Version2: 50edf733bcSKazu Hirata return serializedSizeV2(*this, Schema); 5137f30234SKazu Hirata case Version3: 5237f30234SKazu Hirata return serializedSizeV3(*this, Schema); 53d89914f3SKazu Hirata } 54d89914f3SKazu Hirata llvm_unreachable("unsupported MemProf version"); 55d89914f3SKazu Hirata } 56d89914f3SKazu Hirata 57edf733bcSKazu Hirata static size_t serializedSizeV2(const IndexedMemProfRecord &Record, 58edf733bcSKazu Hirata const MemProfSchema &Schema) { 59e09245b3SKazu Hirata // The number of alloc sites to serialize. 60e09245b3SKazu Hirata size_t Result = sizeof(uint64_t); 61d89914f3SKazu Hirata for (const IndexedAllocationInfo &N : Record.AllocSites) 62edf733bcSKazu Hirata Result += N.serializedSize(Schema, Version2); 63d89914f3SKazu Hirata 64d89914f3SKazu Hirata // The number of callsites we have information for. 65d89914f3SKazu Hirata Result += sizeof(uint64_t); 66d89914f3SKazu Hirata // The CallStackId 67d89914f3SKazu Hirata Result += Record.CallSiteIds.size() * sizeof(CallStackId); 68d89914f3SKazu Hirata return Result; 69d89914f3SKazu Hirata } 70d89914f3SKazu Hirata 7137f30234SKazu Hirata static size_t serializedSizeV3(const IndexedMemProfRecord &Record, 7237f30234SKazu Hirata const MemProfSchema &Schema) { 7337f30234SKazu Hirata // The number of alloc sites to serialize. 7437f30234SKazu Hirata size_t Result = sizeof(uint64_t); 7537f30234SKazu Hirata for (const IndexedAllocationInfo &N : Record.AllocSites) 7637f30234SKazu Hirata Result += N.serializedSize(Schema, Version3); 7737f30234SKazu Hirata 7837f30234SKazu Hirata // The number of callsites we have information for. 7937f30234SKazu Hirata Result += sizeof(uint64_t); 8037f30234SKazu Hirata // The linear call stack ID. 8137f30234SKazu Hirata Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId); 8237f30234SKazu Hirata return Result; 8337f30234SKazu Hirata } 8437f30234SKazu Hirata 85edf733bcSKazu Hirata size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema, 86edf733bcSKazu Hirata IndexedVersion Version) const { 87d89914f3SKazu Hirata switch (Version) { 88d89914f3SKazu Hirata case Version2: 89edf733bcSKazu Hirata return serializedSizeV2(*this, Schema); 9037f30234SKazu Hirata case Version3: 9137f30234SKazu Hirata return serializedSizeV3(*this, Schema); 92d89914f3SKazu Hirata } 93d89914f3SKazu Hirata llvm_unreachable("unsupported MemProf version"); 94d89914f3SKazu Hirata } 95d89914f3SKazu Hirata 963f16ff4eSKazu Hirata static void serializeV2(const IndexedMemProfRecord &Record, 97d89914f3SKazu Hirata const MemProfSchema &Schema, raw_ostream &OS) { 98d89914f3SKazu Hirata using namespace support; 99d89914f3SKazu Hirata 100d89914f3SKazu Hirata endian::Writer LE(OS, llvm::endianness::little); 101d89914f3SKazu Hirata 102d89914f3SKazu Hirata LE.write<uint64_t>(Record.AllocSites.size()); 103d89914f3SKazu Hirata for (const IndexedAllocationInfo &N : Record.AllocSites) { 104d89914f3SKazu Hirata LE.write<CallStackId>(N.CSId); 105d89914f3SKazu Hirata N.Info.serialize(Schema, OS); 106d89914f3SKazu Hirata } 107d89914f3SKazu Hirata 108d89914f3SKazu Hirata // Related contexts. 109d89914f3SKazu Hirata LE.write<uint64_t>(Record.CallSiteIds.size()); 110d89914f3SKazu Hirata for (const auto &CSId : Record.CallSiteIds) 111d89914f3SKazu Hirata LE.write<CallStackId>(CSId); 112d89914f3SKazu Hirata } 113d89914f3SKazu Hirata 1149a8b73c7SKazu Hirata static void serializeV3( 1159a8b73c7SKazu Hirata const IndexedMemProfRecord &Record, const MemProfSchema &Schema, 11690acfbf9SKazu Hirata raw_ostream &OS, 1179a8b73c7SKazu Hirata llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) { 11890acfbf9SKazu Hirata using namespace support; 11990acfbf9SKazu Hirata 12090acfbf9SKazu Hirata endian::Writer LE(OS, llvm::endianness::little); 12190acfbf9SKazu Hirata 12290acfbf9SKazu Hirata LE.write<uint64_t>(Record.AllocSites.size()); 12390acfbf9SKazu Hirata for (const IndexedAllocationInfo &N : Record.AllocSites) { 12490acfbf9SKazu Hirata assert(MemProfCallStackIndexes.contains(N.CSId)); 1259a8b73c7SKazu Hirata LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]); 12690acfbf9SKazu Hirata N.Info.serialize(Schema, OS); 12790acfbf9SKazu Hirata } 12890acfbf9SKazu Hirata 12990acfbf9SKazu Hirata // Related contexts. 13090acfbf9SKazu Hirata LE.write<uint64_t>(Record.CallSiteIds.size()); 13190acfbf9SKazu Hirata for (const auto &CSId : Record.CallSiteIds) { 13290acfbf9SKazu Hirata assert(MemProfCallStackIndexes.contains(CSId)); 1339a8b73c7SKazu Hirata LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]); 13490acfbf9SKazu Hirata } 13590acfbf9SKazu Hirata } 13690acfbf9SKazu Hirata 13790acfbf9SKazu Hirata void IndexedMemProfRecord::serialize( 13890acfbf9SKazu Hirata const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version, 139bfa937a4SKazu Hirata llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes) 140bfa937a4SKazu Hirata const { 141d89914f3SKazu Hirata switch (Version) { 142d89914f3SKazu Hirata case Version2: 143d89914f3SKazu Hirata serializeV2(*this, Schema, OS); 144d89914f3SKazu Hirata return; 14590acfbf9SKazu Hirata case Version3: 14690acfbf9SKazu Hirata serializeV3(*this, Schema, OS, *MemProfCallStackIndexes); 14790acfbf9SKazu Hirata return; 148d89914f3SKazu Hirata } 149d89914f3SKazu Hirata llvm_unreachable("unsupported MemProf version"); 150d89914f3SKazu Hirata } 151d89914f3SKazu Hirata 1523f16ff4eSKazu Hirata static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema, 153d89914f3SKazu Hirata const unsigned char *Ptr) { 154d89914f3SKazu Hirata using namespace support; 155d89914f3SKazu Hirata 156d89914f3SKazu Hirata IndexedMemProfRecord Record; 157d89914f3SKazu Hirata 158d89914f3SKazu Hirata // Read the meminfo nodes. 159d89914f3SKazu Hirata const uint64_t NumNodes = 160f430e374SKazu Hirata endian::readNext<uint64_t, llvm::endianness::little>(Ptr); 1618d2258fdSKazu Hirata Record.AllocSites.reserve(NumNodes); 162d89914f3SKazu Hirata for (uint64_t I = 0; I < NumNodes; I++) { 163d89914f3SKazu Hirata IndexedAllocationInfo Node; 164f430e374SKazu Hirata Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr); 165d89914f3SKazu Hirata Node.Info.deserialize(Schema, Ptr); 166edf733bcSKazu Hirata Ptr += PortableMemInfoBlock::serializedSize(Schema); 167d89914f3SKazu Hirata Record.AllocSites.push_back(Node); 168d89914f3SKazu Hirata } 169d89914f3SKazu Hirata 170d89914f3SKazu Hirata // Read the callsite information. 171d89914f3SKazu Hirata const uint64_t NumCtxs = 172f430e374SKazu Hirata endian::readNext<uint64_t, llvm::endianness::little>(Ptr); 1738d2258fdSKazu Hirata Record.CallSiteIds.reserve(NumCtxs); 174d89914f3SKazu Hirata for (uint64_t J = 0; J < NumCtxs; J++) { 175d89914f3SKazu Hirata CallStackId CSId = 176f430e374SKazu Hirata endian::readNext<CallStackId, llvm::endianness::little>(Ptr); 177d89914f3SKazu Hirata Record.CallSiteIds.push_back(CSId); 178d89914f3SKazu Hirata } 179d89914f3SKazu Hirata 180d89914f3SKazu Hirata return Record; 181d89914f3SKazu Hirata } 182d89914f3SKazu Hirata 18337f30234SKazu Hirata static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema, 18437f30234SKazu Hirata const unsigned char *Ptr) { 18537f30234SKazu Hirata using namespace support; 18637f30234SKazu Hirata 18737f30234SKazu Hirata IndexedMemProfRecord Record; 18837f30234SKazu Hirata 18937f30234SKazu Hirata // Read the meminfo nodes. 19037f30234SKazu Hirata const uint64_t NumNodes = 19137f30234SKazu Hirata endian::readNext<uint64_t, llvm::endianness::little>(Ptr); 19237f30234SKazu Hirata Record.AllocSites.reserve(NumNodes); 19337f30234SKazu Hirata for (uint64_t I = 0; I < NumNodes; I++) { 19437f30234SKazu Hirata IndexedAllocationInfo Node; 1959a8b73c7SKazu Hirata Node.CSId = 1969a8b73c7SKazu Hirata endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr); 19737f30234SKazu Hirata Node.Info.deserialize(Schema, Ptr); 19837f30234SKazu Hirata Ptr += PortableMemInfoBlock::serializedSize(Schema); 19937f30234SKazu Hirata Record.AllocSites.push_back(Node); 20037f30234SKazu Hirata } 20137f30234SKazu Hirata 20237f30234SKazu Hirata // Read the callsite information. 20337f30234SKazu Hirata const uint64_t NumCtxs = 20437f30234SKazu Hirata endian::readNext<uint64_t, llvm::endianness::little>(Ptr); 20537f30234SKazu Hirata Record.CallSiteIds.reserve(NumCtxs); 20637f30234SKazu Hirata for (uint64_t J = 0; J < NumCtxs; J++) { 20737f30234SKazu Hirata // We are storing LinearCallStackId in CallSiteIds, which is a vector of 20837f30234SKazu Hirata // CallStackId. Assert that CallStackId is no smaller than 20937f30234SKazu Hirata // LinearCallStackId. 21037f30234SKazu Hirata static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId)); 21137f30234SKazu Hirata LinearCallStackId CSId = 21237f30234SKazu Hirata endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr); 21337f30234SKazu Hirata Record.CallSiteIds.push_back(CSId); 21437f30234SKazu Hirata } 21537f30234SKazu Hirata 21637f30234SKazu Hirata return Record; 21737f30234SKazu Hirata } 21837f30234SKazu Hirata 219d89914f3SKazu Hirata IndexedMemProfRecord 220d89914f3SKazu Hirata IndexedMemProfRecord::deserialize(const MemProfSchema &Schema, 221d89914f3SKazu Hirata const unsigned char *Ptr, 222d89914f3SKazu Hirata IndexedVersion Version) { 223d89914f3SKazu Hirata switch (Version) { 224d89914f3SKazu Hirata case Version2: 225d89914f3SKazu Hirata return deserializeV2(Schema, Ptr); 22637f30234SKazu Hirata case Version3: 22737f30234SKazu Hirata return deserializeV3(Schema, Ptr); 228d89914f3SKazu Hirata } 229d89914f3SKazu Hirata llvm_unreachable("unsupported MemProf version"); 230d89914f3SKazu Hirata } 231d89914f3SKazu Hirata 2328137bd9eSKazu Hirata MemProfRecord IndexedMemProfRecord::toMemProfRecord( 2334a918f07SKazu Hirata llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const { 2348137bd9eSKazu Hirata MemProfRecord Record; 2358137bd9eSKazu Hirata 236300663a1SKazu Hirata Record.AllocSites.reserve(AllocSites.size()); 2374e0ff054SKazu Hirata for (const IndexedAllocationInfo &IndexedAI : AllocSites) { 2384e0ff054SKazu Hirata AllocationInfo AI; 2398137bd9eSKazu Hirata AI.Info = IndexedAI.Info; 2408137bd9eSKazu Hirata AI.CallStack = Callback(IndexedAI.CSId); 241300663a1SKazu Hirata Record.AllocSites.push_back(std::move(AI)); 2428137bd9eSKazu Hirata } 2438137bd9eSKazu Hirata 244300663a1SKazu Hirata Record.CallSites.reserve(CallSiteIds.size()); 2454e0ff054SKazu Hirata for (CallStackId CSId : CallSiteIds) 2468137bd9eSKazu Hirata Record.CallSites.push_back(Callback(CSId)); 2478137bd9eSKazu Hirata 2488137bd9eSKazu Hirata return Record; 2498137bd9eSKazu Hirata } 2508137bd9eSKazu Hirata 2516dd6a616SSnehasish Kumar GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) { 252749d595dSTeresa Johnson // Canonicalize the function name to drop suffixes such as ".llvm.". Note 253749d595dSTeresa Johnson // we do not drop any ".__uniq." suffixes, as getCanonicalFnName does not drop 254749d595dSTeresa Johnson // those by default. This is by design to differentiate internal linkage 255749d595dSTeresa Johnson // functions during matching. By dropping the other suffixes we can then match 256749d595dSTeresa Johnson // functions in the profile use phase prior to their addition. Note that this 257749d595dSTeresa Johnson // applies to both instrumented and sampled function names. 2580edc32fdSSnehasish Kumar StringRef CanonicalName = 2590edc32fdSSnehasish Kumar sampleprof::FunctionSamples::getCanonicalFnName(FunctionName); 26027a4f254SSnehasish Kumar 26127a4f254SSnehasish Kumar // We use the function guid which we expect to be a uint64_t. At 2620edc32fdSSnehasish Kumar // this time, it is the lower 64 bits of the md5 of the canonical 2630edc32fdSSnehasish Kumar // function name. 2640edc32fdSSnehasish Kumar return Function::getGUID(CanonicalName); 2650a418490SSnehasish Kumar } 2660a418490SSnehasish Kumar 2670a418490SSnehasish Kumar Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) { 2680a418490SSnehasish Kumar using namespace support; 2690a418490SSnehasish Kumar 2700a418490SSnehasish Kumar const unsigned char *Ptr = Buffer; 2710a418490SSnehasish Kumar const uint64_t NumSchemaIds = 272f430e374SKazu Hirata endian::readNext<uint64_t, llvm::endianness::little>(Ptr); 2730a418490SSnehasish Kumar if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) { 2740a418490SSnehasish Kumar return make_error<InstrProfError>(instrprof_error::malformed, 2750a418490SSnehasish Kumar "memprof schema invalid"); 2760a418490SSnehasish Kumar } 2770a418490SSnehasish Kumar 2780a418490SSnehasish Kumar MemProfSchema Result; 2790a418490SSnehasish Kumar for (size_t I = 0; I < NumSchemaIds; I++) { 28002f67c09SKazu Hirata const uint64_t Tag = 281f430e374SKazu Hirata endian::readNext<uint64_t, llvm::endianness::little>(Ptr); 2820a418490SSnehasish Kumar if (Tag >= static_cast<uint64_t>(Meta::Size)) { 2830a418490SSnehasish Kumar return make_error<InstrProfError>(instrprof_error::malformed, 2840a418490SSnehasish Kumar "memprof schema invalid"); 2850a418490SSnehasish Kumar } 2860a418490SSnehasish Kumar Result.push_back(static_cast<Meta>(Tag)); 2870a418490SSnehasish Kumar } 288e2d539bbSKazu Hirata // Advance the buffer to one past the schema if we succeeded. 2890a418490SSnehasish Kumar Buffer = Ptr; 2900a418490SSnehasish Kumar return Result; 2910a418490SSnehasish Kumar } 2920a418490SSnehasish Kumar 293*6fb967ecSKazu Hirata CallStackId IndexedMemProfData::hashCallStack(ArrayRef<FrameId> CS) const { 29474799f42SKazu Hirata llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little> 29574799f42SKazu Hirata HashBuilder; 29674799f42SKazu Hirata for (FrameId F : CS) 29774799f42SKazu Hirata HashBuilder.add(F); 29874799f42SKazu Hirata llvm::BLAKE3Result<8> Hash = HashBuilder.final(); 29974799f42SKazu Hirata CallStackId CSId; 30074799f42SKazu Hirata std::memcpy(&CSId, Hash.data(), sizeof(Hash)); 30174799f42SKazu Hirata return CSId; 30274799f42SKazu Hirata } 30374799f42SKazu Hirata 3045c0df5feSKazu Hirata // Encode a call stack into RadixArray. Return the starting index within 3055c0df5feSKazu Hirata // RadixArray. For each call stack we encode, we emit two or three components 3065c0df5feSKazu Hirata // into RadixArray. If a given call stack doesn't have a common prefix relative 3075c0df5feSKazu Hirata // to the previous one, we emit: 3085c0df5feSKazu Hirata // 3095c0df5feSKazu Hirata // - the frames in the given call stack in the root-to-leaf order 3105c0df5feSKazu Hirata // 3115c0df5feSKazu Hirata // - the length of the given call stack 3125c0df5feSKazu Hirata // 3135c0df5feSKazu Hirata // If a given call stack has a non-empty common prefix relative to the previous 3145c0df5feSKazu Hirata // one, we emit: 3155c0df5feSKazu Hirata // 3165c0df5feSKazu Hirata // - the relative location of the common prefix, encoded as a negative number. 3175c0df5feSKazu Hirata // 3185c0df5feSKazu Hirata // - a portion of the given call stack that's beyond the common prefix 3195c0df5feSKazu Hirata // 3205c0df5feSKazu Hirata // - the length of the given call stack, including the length of the common 3215c0df5feSKazu Hirata // prefix. 3225c0df5feSKazu Hirata // 3235c0df5feSKazu Hirata // The resulting RadixArray requires a somewhat unintuitive backward traversal 3245c0df5feSKazu Hirata // to reconstruct a call stack -- read the call stack length and scan backward 3255c0df5feSKazu Hirata // while collecting frames in the leaf to root order. build, the caller of this 3265c0df5feSKazu Hirata // function, reverses RadixArray in place so that we can reconstruct a call 3275c0df5feSKazu Hirata // stack as if we were deserializing an array in a typical way -- the call stack 3285c0df5feSKazu Hirata // length followed by the frames in the leaf-to-root order except that we need 3295c0df5feSKazu Hirata // to handle pointers to parents along the way. 3305c0df5feSKazu Hirata // 3315c0df5feSKazu Hirata // To quickly determine the location of the common prefix within RadixArray, 3325c0df5feSKazu Hirata // Indexes caches the indexes of the previous call stack's frames within 3335c0df5feSKazu Hirata // RadixArray. 334e14827f0STeresa Johnson template <typename FrameIdTy> 335e14827f0STeresa Johnson LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack( 336e14827f0STeresa Johnson const llvm::SmallVector<FrameIdTy> *CallStack, 337e14827f0STeresa Johnson const llvm::SmallVector<FrameIdTy> *Prev, 338ff7b42c1SKazu Hirata const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) { 3395c0df5feSKazu Hirata // Compute the length of the common root prefix between Prev and CallStack. 3405c0df5feSKazu Hirata uint32_t CommonLen = 0; 3415c0df5feSKazu Hirata if (Prev) { 3425c0df5feSKazu Hirata auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(), 3435c0df5feSKazu Hirata CallStack->rend()); 3445c0df5feSKazu Hirata CommonLen = std::distance(CallStack->rbegin(), Pos.second); 3455c0df5feSKazu Hirata } 3465c0df5feSKazu Hirata 3475c0df5feSKazu Hirata // Drop the portion beyond CommonLen. 3485c0df5feSKazu Hirata assert(CommonLen <= Indexes.size()); 3495c0df5feSKazu Hirata Indexes.resize(CommonLen); 3505c0df5feSKazu Hirata 3515c0df5feSKazu Hirata // Append a pointer to the parent. 3525c0df5feSKazu Hirata if (CommonLen) { 3535c0df5feSKazu Hirata uint32_t CurrentIndex = RadixArray.size(); 3545c0df5feSKazu Hirata uint32_t ParentIndex = Indexes.back(); 3555c0df5feSKazu Hirata // The offset to the parent must be negative because we are pointing to an 3565c0df5feSKazu Hirata // element we've already added to RadixArray. 3575c0df5feSKazu Hirata assert(ParentIndex < CurrentIndex); 3585c0df5feSKazu Hirata RadixArray.push_back(ParentIndex - CurrentIndex); 3595c0df5feSKazu Hirata } 3605c0df5feSKazu Hirata 3615c0df5feSKazu Hirata // Copy the part of the call stack beyond the common prefix to RadixArray. 3625c0df5feSKazu Hirata assert(CommonLen <= CallStack->size()); 363e14827f0STeresa Johnson for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) { 3645c0df5feSKazu Hirata // Remember the index of F in RadixArray. 3655c0df5feSKazu Hirata Indexes.push_back(RadixArray.size()); 366e14827f0STeresa Johnson RadixArray.push_back( 367e14827f0STeresa Johnson MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F); 3685c0df5feSKazu Hirata } 3695c0df5feSKazu Hirata assert(CallStack->size() == Indexes.size()); 3705c0df5feSKazu Hirata 3715c0df5feSKazu Hirata // End with the call stack length. 3725c0df5feSKazu Hirata RadixArray.push_back(CallStack->size()); 3735c0df5feSKazu Hirata 3745c0df5feSKazu Hirata // Return the index within RadixArray where we can start reconstructing a 3755c0df5feSKazu Hirata // given call stack from. 3765c0df5feSKazu Hirata return RadixArray.size() - 1; 3775c0df5feSKazu Hirata } 3785c0df5feSKazu Hirata 379e14827f0STeresa Johnson template <typename FrameIdTy> 380e14827f0STeresa Johnson void CallStackRadixTreeBuilder<FrameIdTy>::build( 381e14827f0STeresa Johnson llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> 3825c0df5feSKazu Hirata &&MemProfCallStackData, 383ff7b42c1SKazu Hirata const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes, 384e14827f0STeresa Johnson llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) { 3855c0df5feSKazu Hirata // Take the vector portion of MemProfCallStackData. The vector is exactly 3865c0df5feSKazu Hirata // what we need to sort. Also, we no longer need its lookup capability. 3875c0df5feSKazu Hirata llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector(); 3885c0df5feSKazu Hirata 3895c0df5feSKazu Hirata // Return early if we have no work to do. 3905c0df5feSKazu Hirata if (CallStacks.empty()) { 3915c0df5feSKazu Hirata RadixArray.clear(); 3925c0df5feSKazu Hirata CallStackPos.clear(); 3935c0df5feSKazu Hirata return; 3945c0df5feSKazu Hirata } 3955c0df5feSKazu Hirata 396dc3f8c2fSKazu Hirata // Sorting the list of call stacks in the dictionary order is sufficient to 397dc3f8c2fSKazu Hirata // maximize the length of the common prefix between two adjacent call stacks 398dc3f8c2fSKazu Hirata // and thus minimize the length of RadixArray. However, we go one step 399dc3f8c2fSKazu Hirata // further and try to reduce the number of times we follow pointers to parents 400dc3f8c2fSKazu Hirata // during deserilization. Consider a poorly encoded radix tree: 401dc3f8c2fSKazu Hirata // 402dc3f8c2fSKazu Hirata // CallStackId 1: f1 -> f2 -> f3 403dc3f8c2fSKazu Hirata // | 404dc3f8c2fSKazu Hirata // CallStackId 2: +--- f4 -> f5 405dc3f8c2fSKazu Hirata // | 406dc3f8c2fSKazu Hirata // CallStackId 3: +--> f6 407dc3f8c2fSKazu Hirata // 408dc3f8c2fSKazu Hirata // Here, f2 and f4 appear once and twice, respectively, in the call stacks. 409dc3f8c2fSKazu Hirata // Once we encode CallStackId 1 into RadixArray, every other call stack with 410dc3f8c2fSKazu Hirata // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3 411dc3f8c2fSKazu Hirata // share "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers to 412dc3f8c2fSKazu Hirata // parents twice. 413dc3f8c2fSKazu Hirata // 414dc3f8c2fSKazu Hirata // We try to alleviate the situation by sorting the list of call stacks by 415dc3f8c2fSKazu Hirata // comparing the popularity of frames rather than the integer values of 416dc3f8c2fSKazu Hirata // FrameIds. In the example above, f4 is more popular than f2, so we sort the 417dc3f8c2fSKazu Hirata // call stacks and encode them as: 418dc3f8c2fSKazu Hirata // 419dc3f8c2fSKazu Hirata // CallStackId 2: f1 -- f4 -> f5 420dc3f8c2fSKazu Hirata // | | 421dc3f8c2fSKazu Hirata // CallStackId 3: | +--> f6 422dc3f8c2fSKazu Hirata // | 423dc3f8c2fSKazu Hirata // CallStackId 1: +--> f2 -> f3 424dc3f8c2fSKazu Hirata // 425dc3f8c2fSKazu Hirata // Notice that CallStackId 3 follows a pointer to a parent only once. 426dc3f8c2fSKazu Hirata // 427dc3f8c2fSKazu Hirata // All this is a quick-n-dirty trick to reduce the number of jumps. The 428dc3f8c2fSKazu Hirata // proper way would be to compute the weight of each radix tree node -- how 429dc3f8c2fSKazu Hirata // many call stacks use a given radix tree node, and encode a radix tree from 430dc3f8c2fSKazu Hirata // the heaviest node first. We do not do so because that's a lot of work. 4315c0df5feSKazu Hirata llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) { 4325c0df5feSKazu Hirata // Call stacks are stored from leaf to root. Perform comparisons from the 4335c0df5feSKazu Hirata // root. 4345c0df5feSKazu Hirata return std::lexicographical_compare( 4355c0df5feSKazu Hirata L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(), 436e14827f0STeresa Johnson [&](FrameIdTy F1, FrameIdTy F2) { 437dc3f8c2fSKazu Hirata uint64_t H1 = FrameHistogram[F1].Count; 438dc3f8c2fSKazu Hirata uint64_t H2 = FrameHistogram[F2].Count; 439dc3f8c2fSKazu Hirata // Popular frames should come later because we encode call stacks from 440dc3f8c2fSKazu Hirata // the last one in the list. 441dc3f8c2fSKazu Hirata if (H1 != H2) 442dc3f8c2fSKazu Hirata return H1 < H2; 443dc3f8c2fSKazu Hirata // For sort stability. 444dc3f8c2fSKazu Hirata return F1 < F2; 445dc3f8c2fSKazu Hirata }); 4465c0df5feSKazu Hirata }); 4475c0df5feSKazu Hirata 4485c0df5feSKazu Hirata // Reserve some reasonable amount of storage. 4495c0df5feSKazu Hirata RadixArray.clear(); 4505c0df5feSKazu Hirata RadixArray.reserve(CallStacks.size() * 8); 4515c0df5feSKazu Hirata 4525c0df5feSKazu Hirata // Indexes will grow as long as the longest call stack. 4535c0df5feSKazu Hirata Indexes.clear(); 4545c0df5feSKazu Hirata Indexes.reserve(512); 4555c0df5feSKazu Hirata 4565c0df5feSKazu Hirata // CallStackPos will grow to exactly CallStacks.size() entries. 4575c0df5feSKazu Hirata CallStackPos.clear(); 4585c0df5feSKazu Hirata CallStackPos.reserve(CallStacks.size()); 4595c0df5feSKazu Hirata 4605c0df5feSKazu Hirata // Compute the radix array. We encode one call stack at a time, computing the 4615c0df5feSKazu Hirata // longest prefix that's shared with the previous call stack we encode. For 4625c0df5feSKazu Hirata // each call stack we encode, we remember a mapping from CallStackId to its 4635c0df5feSKazu Hirata // position within RadixArray. 4645c0df5feSKazu Hirata // 4655c0df5feSKazu Hirata // As an optimization, we encode from the last call stack in CallStacks to 4665c0df5feSKazu Hirata // reduce the number of times we follow pointers to the parents. Consider the 4675c0df5feSKazu Hirata // list of call stacks that has been sorted in the dictionary order: 4685c0df5feSKazu Hirata // 4695c0df5feSKazu Hirata // Call Stack 1: F1 4705c0df5feSKazu Hirata // Call Stack 2: F1 -> F2 4715c0df5feSKazu Hirata // Call Stack 3: F1 -> F2 -> F3 4725c0df5feSKazu Hirata // 4735c0df5feSKazu Hirata // If we traversed CallStacks in the forward order, we would end up with a 4745c0df5feSKazu Hirata // radix tree like: 4755c0df5feSKazu Hirata // 4765c0df5feSKazu Hirata // Call Stack 1: F1 4775c0df5feSKazu Hirata // | 4785c0df5feSKazu Hirata // Call Stack 2: +---> F2 4795c0df5feSKazu Hirata // | 4805c0df5feSKazu Hirata // Call Stack 3: +---> F3 4815c0df5feSKazu Hirata // 4825c0df5feSKazu Hirata // Notice that each call stack jumps to the previous one. However, if we 4835c0df5feSKazu Hirata // traverse CallStacks in the reverse order, then Call Stack 3 has the 4845c0df5feSKazu Hirata // complete call stack encoded without any pointers. Call Stack 1 and 2 point 4855c0df5feSKazu Hirata // to appropriate prefixes of Call Stack 3. 486e14827f0STeresa Johnson const llvm::SmallVector<FrameIdTy> *Prev = nullptr; 4875c0df5feSKazu Hirata for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) { 4885c0df5feSKazu Hirata LinearCallStackId Pos = 4895c0df5feSKazu Hirata encodeCallStack(&CallStack, Prev, MemProfFrameIndexes); 4905c0df5feSKazu Hirata CallStackPos.insert({CSId, Pos}); 4915c0df5feSKazu Hirata Prev = &CallStack; 4925c0df5feSKazu Hirata } 4935c0df5feSKazu Hirata 4945c0df5feSKazu Hirata // "RadixArray.size() - 1" below is problematic if RadixArray is empty. 4955c0df5feSKazu Hirata assert(!RadixArray.empty()); 4965c0df5feSKazu Hirata 4975c0df5feSKazu Hirata // Reverse the radix array in place. We do so mostly for intuitive 4985c0df5feSKazu Hirata // deserialization where we would read the length field and then the call 4995c0df5feSKazu Hirata // stack frames proper just like any other array deserialization, except 5005c0df5feSKazu Hirata // that we have occasional jumps to take advantage of prefixes. 5015c0df5feSKazu Hirata for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J) 5025c0df5feSKazu Hirata std::swap(RadixArray[I], RadixArray[J]); 5035c0df5feSKazu Hirata 5045c0df5feSKazu Hirata // "Reverse" the indexes stored in CallStackPos. 5055c0df5feSKazu Hirata for (auto &[K, V] : CallStackPos) 5065c0df5feSKazu Hirata V = RadixArray.size() - 1 - V; 5075c0df5feSKazu Hirata } 5085c0df5feSKazu Hirata 509e14827f0STeresa Johnson // Explicitly instantiate class with the utilized FrameIdTy. 510e14827f0STeresa Johnson template class CallStackRadixTreeBuilder<FrameId>; 511776476c2STeresa Johnson template class CallStackRadixTreeBuilder<LinearFrameId>; 512e14827f0STeresa Johnson 513e14827f0STeresa Johnson template <typename FrameIdTy> 514e14827f0STeresa Johnson llvm::DenseMap<FrameIdTy, FrameStat> 515e14827f0STeresa Johnson computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>> 516dc3f8c2fSKazu Hirata &MemProfCallStackData) { 517e14827f0STeresa Johnson llvm::DenseMap<FrameIdTy, FrameStat> Histogram; 518dc3f8c2fSKazu Hirata 519dc3f8c2fSKazu Hirata for (const auto &KV : MemProfCallStackData) { 520dc3f8c2fSKazu Hirata const auto &CS = KV.second; 521dc3f8c2fSKazu Hirata for (unsigned I = 0, E = CS.size(); I != E; ++I) { 522dc3f8c2fSKazu Hirata auto &S = Histogram[CS[I]]; 523dc3f8c2fSKazu Hirata ++S.Count; 524dc3f8c2fSKazu Hirata S.PositionSum += I; 525dc3f8c2fSKazu Hirata } 526dc3f8c2fSKazu Hirata } 527dc3f8c2fSKazu Hirata return Histogram; 528dc3f8c2fSKazu Hirata } 529dc3f8c2fSKazu Hirata 530e14827f0STeresa Johnson // Explicitly instantiate function with the utilized FrameIdTy. 531e14827f0STeresa Johnson template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>( 532e14827f0STeresa Johnson llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>> 533e14827f0STeresa Johnson &MemProfCallStackData); 534776476c2STeresa Johnson template llvm::DenseMap<LinearFrameId, FrameStat> 535776476c2STeresa Johnson computeFrameHistogram<LinearFrameId>( 536776476c2STeresa Johnson llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>> 537776476c2STeresa Johnson &MemProfCallStackData); 5380a418490SSnehasish Kumar } // namespace memprof 5390a418490SSnehasish Kumar } // namespace llvm 540