1 #include "llvm/ProfileData/MemProf.h" 2 #include "llvm/ADT/SmallVector.h" 3 #include "llvm/IR/Function.h" 4 #include "llvm/ProfileData/InstrProf.h" 5 #include "llvm/ProfileData/SampleProf.h" 6 #include "llvm/Support/BLAKE3.h" 7 #include "llvm/Support/Endian.h" 8 #include "llvm/Support/EndianStream.h" 9 #include "llvm/Support/HashBuilder.h" 10 11 namespace llvm { 12 namespace memprof { 13 MemProfSchema getFullSchema() { 14 MemProfSchema List; 15 #define MIBEntryDef(NameTag, Name, Type) List.push_back(Meta::Name); 16 #include "llvm/ProfileData/MIBEntryDef.inc" 17 #undef MIBEntryDef 18 return List; 19 } 20 21 MemProfSchema getHotColdSchema() { 22 return {Meta::AllocCount, Meta::TotalSize, Meta::TotalLifetime, 23 Meta::TotalLifetimeAccessDensity}; 24 } 25 26 static size_t serializedSizeV2(const IndexedAllocationInfo &IAI, 27 const MemProfSchema &Schema) { 28 size_t Size = 0; 29 // The CallStackId 30 Size += sizeof(CallStackId); 31 // The size of the payload. 32 Size += PortableMemInfoBlock::serializedSize(Schema); 33 return Size; 34 } 35 36 static size_t serializedSizeV3(const IndexedAllocationInfo &IAI, 37 const MemProfSchema &Schema) { 38 size_t Size = 0; 39 // The linear call stack ID. 40 Size += sizeof(LinearCallStackId); 41 // The size of the payload. 42 Size += PortableMemInfoBlock::serializedSize(Schema); 43 return Size; 44 } 45 46 size_t IndexedAllocationInfo::serializedSize(const MemProfSchema &Schema, 47 IndexedVersion Version) const { 48 switch (Version) { 49 case Version2: 50 return serializedSizeV2(*this, Schema); 51 case Version3: 52 return serializedSizeV3(*this, Schema); 53 } 54 llvm_unreachable("unsupported MemProf version"); 55 } 56 57 static size_t serializedSizeV2(const IndexedMemProfRecord &Record, 58 const MemProfSchema &Schema) { 59 // The number of alloc sites to serialize. 60 size_t Result = sizeof(uint64_t); 61 for (const IndexedAllocationInfo &N : Record.AllocSites) 62 Result += N.serializedSize(Schema, Version2); 63 64 // The number of callsites we have information for. 65 Result += sizeof(uint64_t); 66 // The CallStackId 67 Result += Record.CallSiteIds.size() * sizeof(CallStackId); 68 return Result; 69 } 70 71 static size_t serializedSizeV3(const IndexedMemProfRecord &Record, 72 const MemProfSchema &Schema) { 73 // The number of alloc sites to serialize. 74 size_t Result = sizeof(uint64_t); 75 for (const IndexedAllocationInfo &N : Record.AllocSites) 76 Result += N.serializedSize(Schema, Version3); 77 78 // The number of callsites we have information for. 79 Result += sizeof(uint64_t); 80 // The linear call stack ID. 81 Result += Record.CallSiteIds.size() * sizeof(LinearCallStackId); 82 return Result; 83 } 84 85 size_t IndexedMemProfRecord::serializedSize(const MemProfSchema &Schema, 86 IndexedVersion Version) const { 87 switch (Version) { 88 case Version2: 89 return serializedSizeV2(*this, Schema); 90 case Version3: 91 return serializedSizeV3(*this, Schema); 92 } 93 llvm_unreachable("unsupported MemProf version"); 94 } 95 96 static void serializeV2(const IndexedMemProfRecord &Record, 97 const MemProfSchema &Schema, raw_ostream &OS) { 98 using namespace support; 99 100 endian::Writer LE(OS, llvm::endianness::little); 101 102 LE.write<uint64_t>(Record.AllocSites.size()); 103 for (const IndexedAllocationInfo &N : Record.AllocSites) { 104 LE.write<CallStackId>(N.CSId); 105 N.Info.serialize(Schema, OS); 106 } 107 108 // Related contexts. 
static void serializeV2(const IndexedMemProfRecord &Record,
                        const MemProfSchema &Schema, raw_ostream &OS) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    LE.write<CallStackId>(N.CSId);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds)
    LE.write<CallStackId>(CSId);
}

static void serializeV3(
    const IndexedMemProfRecord &Record, const MemProfSchema &Schema,
    raw_ostream &OS,
    llvm::DenseMap<CallStackId, LinearCallStackId> &MemProfCallStackIndexes) {
  using namespace support;

  endian::Writer LE(OS, llvm::endianness::little);

  LE.write<uint64_t>(Record.AllocSites.size());
  for (const IndexedAllocationInfo &N : Record.AllocSites) {
    assert(MemProfCallStackIndexes.contains(N.CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[N.CSId]);
    N.Info.serialize(Schema, OS);
  }

  // Related contexts.
  LE.write<uint64_t>(Record.CallSiteIds.size());
  for (const auto &CSId : Record.CallSiteIds) {
    assert(MemProfCallStackIndexes.contains(CSId));
    LE.write<LinearCallStackId>(MemProfCallStackIndexes[CSId]);
  }
}

void IndexedMemProfRecord::serialize(
    const MemProfSchema &Schema, raw_ostream &OS, IndexedVersion Version,
    llvm::DenseMap<CallStackId, LinearCallStackId> *MemProfCallStackIndexes)
    const {
  switch (Version) {
  case Version2:
    serializeV2(*this, Schema, OS);
    return;
  case Version3:
    serializeV3(*this, Schema, OS, *MemProfCallStackIndexes);
    return;
  }
  llvm_unreachable("unsupported MemProf version");
}

static IndexedMemProfRecord deserializeV2(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId = endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    CallStackId CSId =
        endian::readNext<CallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

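// Deserialize a Version 3 record. The format mirrors serializeV3 above: call
// stacks are referenced by LinearCallStackId, a position assigned when the
// call stacks are linearized (see CallStackRadixTreeBuilder below), rather
// than by the 64-bit CallStackId hash used in Version 2.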
static IndexedMemProfRecord deserializeV3(const MemProfSchema &Schema,
                                          const unsigned char *Ptr) {
  using namespace support;

  IndexedMemProfRecord Record;

  // Read the meminfo nodes.
  const uint64_t NumNodes =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.AllocSites.reserve(NumNodes);
  for (uint64_t I = 0; I < NumNodes; I++) {
    IndexedAllocationInfo Node;
    Node.CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Node.Info.deserialize(Schema, Ptr);
    Ptr += PortableMemInfoBlock::serializedSize(Schema);
    Record.AllocSites.push_back(Node);
  }

  // Read the callsite information.
  const uint64_t NumCtxs =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  Record.CallSiteIds.reserve(NumCtxs);
  for (uint64_t J = 0; J < NumCtxs; J++) {
    // We are storing LinearCallStackId in CallSiteIds, which is a vector of
    // CallStackId. Assert that CallStackId is no smaller than
    // LinearCallStackId.
    static_assert(sizeof(LinearCallStackId) <= sizeof(CallStackId));
    LinearCallStackId CSId =
        endian::readNext<LinearCallStackId, llvm::endianness::little>(Ptr);
    Record.CallSiteIds.push_back(CSId);
  }

  return Record;
}

IndexedMemProfRecord
IndexedMemProfRecord::deserialize(const MemProfSchema &Schema,
                                  const unsigned char *Ptr,
                                  IndexedVersion Version) {
  switch (Version) {
  case Version2:
    return deserializeV2(Schema, Ptr);
  case Version3:
    return deserializeV3(Schema, Ptr);
  }
  llvm_unreachable("unsupported MemProf version");
}

MemProfRecord IndexedMemProfRecord::toMemProfRecord(
    llvm::function_ref<std::vector<Frame>(const CallStackId)> Callback) const {
  MemProfRecord Record;

  Record.AllocSites.reserve(AllocSites.size());
  for (const IndexedAllocationInfo &IndexedAI : AllocSites) {
    AllocationInfo AI;
    AI.Info = IndexedAI.Info;
    AI.CallStack = Callback(IndexedAI.CSId);
    Record.AllocSites.push_back(std::move(AI));
  }

  Record.CallSites.reserve(CallSiteIds.size());
  for (CallStackId CSId : CallSiteIds)
    Record.CallSites.push_back(Callback(CSId));

  return Record;
}

GlobalValue::GUID IndexedMemProfRecord::getGUID(const StringRef FunctionName) {
  // Canonicalize the function name to drop suffixes such as ".llvm.". Note
  // that we do not drop any ".__uniq." suffixes, as getCanonicalFnName does
  // not drop those by default. This is by design to differentiate internal
  // linkage functions during matching. By dropping the other suffixes we can
  // then match functions in the profile use phase prior to their addition.
  // Note that this applies to both instrumented and sampled function names.
  StringRef CanonicalName =
      sampleprof::FunctionSamples::getCanonicalFnName(FunctionName);

  // We use the function GUID, which we expect to be a uint64_t. At this time,
  // it is the lower 64 bits of the MD5 of the canonical function name.
  return Function::getGUID(CanonicalName);
}

Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
  using namespace support;

  const unsigned char *Ptr = Buffer;
  const uint64_t NumSchemaIds =
      endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
  if (NumSchemaIds > static_cast<uint64_t>(Meta::Size)) {
    return make_error<InstrProfError>(instrprof_error::malformed,
                                      "memprof schema invalid");
  }

  MemProfSchema Result;
  for (size_t I = 0; I < NumSchemaIds; I++) {
    const uint64_t Tag =
        endian::readNext<uint64_t, llvm::endianness::little>(Ptr);
    if (Tag >= static_cast<uint64_t>(Meta::Size)) {
      return make_error<InstrProfError>(instrprof_error::malformed,
                                        "memprof schema invalid");
    }
    Result.push_back(static_cast<Meta>(Tag));
  }
  // Advance the buffer to one past the schema if we succeeded.
  Buffer = Ptr;
  return Result;
}

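// Compute a CallStackId for the given call stack by hashing its frame IDs
// with BLAKE3 truncated to 8 bytes, i.e. the width of CallStackId.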
CallStackId IndexedMemProfData::hashCallStack(ArrayRef<FrameId> CS) const {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  for (FrameId F : CS)
    HashBuilder.add(F);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  CallStackId CSId;
  std::memcpy(&CSId, Hash.data(), sizeof(Hash));
  return CSId;
}

// Encode a call stack into RadixArray. Return the starting index within
// RadixArray. For each call stack we encode, we emit two or three components
// into RadixArray. If a given call stack doesn't have a common prefix relative
// to the previous one, we emit:
//
// - the frames in the given call stack in the root-to-leaf order
//
// - the length of the given call stack
//
// If a given call stack has a non-empty common prefix relative to the previous
// one, we emit:
//
// - the relative location of the common prefix, encoded as a negative number.
//
// - a portion of the given call stack that's beyond the common prefix
//
// - the length of the given call stack, including the length of the common
//   prefix.
//
// The resulting RadixArray requires a somewhat unintuitive backward traversal
// to reconstruct a call stack -- read the call stack length and scan backward
// while collecting frames in the leaf to root order. build, the caller of this
// function, reverses RadixArray in place so that we can reconstruct a call
// stack as if we were deserializing an array in a typical way -- the call
// stack length followed by the frames in the leaf-to-root order except that we
// need to handle pointers to parents along the way.
//
// To quickly determine the location of the common prefix within RadixArray,
// Indexes caches the indexes of the previous call stack's frames within
// RadixArray.
template <typename FrameIdTy>
LinearCallStackId CallStackRadixTreeBuilder<FrameIdTy>::encodeCallStack(
    const llvm::SmallVector<FrameIdTy> *CallStack,
    const llvm::SmallVector<FrameIdTy> *Prev,
    const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes) {
  // Compute the length of the common root prefix between Prev and CallStack.
  uint32_t CommonLen = 0;
  if (Prev) {
    auto Pos = std::mismatch(Prev->rbegin(), Prev->rend(), CallStack->rbegin(),
                             CallStack->rend());
    CommonLen = std::distance(CallStack->rbegin(), Pos.second);
  }

  // Drop the portion beyond CommonLen.
  assert(CommonLen <= Indexes.size());
  Indexes.resize(CommonLen);

  // Append a pointer to the parent.
  if (CommonLen) {
    uint32_t CurrentIndex = RadixArray.size();
    uint32_t ParentIndex = Indexes.back();
    // The offset to the parent must be negative because we are pointing to an
    // element we've already added to RadixArray.
    assert(ParentIndex < CurrentIndex);
    RadixArray.push_back(ParentIndex - CurrentIndex);
  }

  // Copy the part of the call stack beyond the common prefix to RadixArray.
  assert(CommonLen <= CallStack->size());
  for (FrameIdTy F : llvm::drop_begin(llvm::reverse(*CallStack), CommonLen)) {
    // Remember the index of F in RadixArray.
    Indexes.push_back(RadixArray.size());
    RadixArray.push_back(
        MemProfFrameIndexes ? MemProfFrameIndexes->find(F)->second : F);
  }
  assert(CallStack->size() == Indexes.size());

  // End with the call stack length.
  RadixArray.push_back(CallStack->size());

  // Return the index within RadixArray where we can start reconstructing a
  // given call stack from.
  return RadixArray.size() - 1;
}

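// For illustration, suppose build below is given two call stacks, stored
// leaf-to-root, that share the root prefix F1 -> F2 (frame IDs shown
// symbolically), and suppose the sort in build places Call Stack A first:
//
//   Call Stack A: F3, F2, F1   (root-to-leaf: F1 -> F2 -> F3)
//   Call Stack B: F4, F2, F1   (root-to-leaf: F1 -> F2 -> F4)
//
// Encoding then starts from Call Stack B, and after the in-place reversal the
// radix array looks like:
//
//   Index: 0   1   2   3   4   5   6
//   Value: 3   F3  -3  3   F4  F2  F1
//
// Call Stack B starts at index 3: its length (3) followed by F4, F2, F1 with
// no jumps. Call Stack A starts at index 0: its length (3), the leaf frame
// F3, and then the entry -3, a pointer to the parent; negating it gives the
// forward distance to index 5, where the shared frames F2 and F1 continue.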
template <typename FrameIdTy>
void CallStackRadixTreeBuilder<FrameIdTy>::build(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
        &&MemProfCallStackData,
    const llvm::DenseMap<FrameIdTy, LinearFrameId> *MemProfFrameIndexes,
    llvm::DenseMap<FrameIdTy, FrameStat> &FrameHistogram) {
  // Take the vector portion of MemProfCallStackData. The vector is exactly
  // what we need to sort. Also, we no longer need its lookup capability.
  llvm::SmallVector<CSIdPair, 0> CallStacks = MemProfCallStackData.takeVector();

  // Return early if we have no work to do.
  if (CallStacks.empty()) {
    RadixArray.clear();
    CallStackPos.clear();
    return;
  }

  // Sorting the list of call stacks in the dictionary order is sufficient to
  // maximize the length of the common prefix between two adjacent call stacks
  // and thus minimize the length of RadixArray. However, we go one step
  // further and try to reduce the number of times we follow pointers to
  // parents during deserialization. Consider a poorly encoded radix tree:
  //
  // CallStackId 1:  f1 -> f2 -> f3
  //                  |
  // CallStackId 2:   +--> f4 -> f5
  //                        |
  // CallStackId 3:         +--> f6
  //
  // Here, f2 and f4 appear once and twice, respectively, in the call stacks.
  // Once we encode CallStackId 1 into RadixArray, every other call stack with
  // common prefix f1 ends up pointing to CallStackId 1. Since CallStackId 3
  // shares "f1 f4" with CallStackId 2, CallStackId 3 needs to follow pointers
  // to parents twice.
  //
  // We try to alleviate the situation by sorting the list of call stacks by
  // comparing the popularity of frames rather than the integer values of
  // FrameIds. In the example above, f4 is more popular than f2, so we sort
  // the call stacks and encode them as:
  //
  // CallStackId 2:  f1 -> f4 -> f5
  //                  |     |
  // CallStackId 3:   |     +--> f6
  //                  |
  // CallStackId 1:   +--> f2 -> f3
  //
  // Notice that CallStackId 3 follows a pointer to a parent only once.
  //
  // All this is a quick-n-dirty trick to reduce the number of jumps. The
  // proper way would be to compute the weight of each radix tree node -- how
  // many call stacks use a given radix tree node -- and encode a radix tree
  // from the heaviest node first. We do not do so because that's a lot of
  // work.
  llvm::sort(CallStacks, [&](const CSIdPair &L, const CSIdPair &R) {
    // Call stacks are stored from leaf to root. Perform comparisons from the
    // root.
    return std::lexicographical_compare(
        L.second.rbegin(), L.second.rend(), R.second.rbegin(), R.second.rend(),
        [&](FrameIdTy F1, FrameIdTy F2) {
          uint64_t H1 = FrameHistogram[F1].Count;
          uint64_t H2 = FrameHistogram[F2].Count;
          // Popular frames should come later because we encode call stacks
          // from the last one in the list.
          if (H1 != H2)
            return H1 < H2;
          // For sort stability.
          return F1 < F2;
        });
  });

  // Reserve some reasonable amount of storage.
  RadixArray.clear();
  RadixArray.reserve(CallStacks.size() * 8);

  // Indexes will grow as long as the longest call stack.
  Indexes.clear();
  Indexes.reserve(512);

  // CallStackPos will grow to exactly CallStacks.size() entries.
  CallStackPos.clear();
  CallStackPos.reserve(CallStacks.size());

  // Compute the radix array. We encode one call stack at a time, computing
  // the longest prefix that's shared with the previous call stack we encode.
  // For each call stack we encode, we remember a mapping from CallStackId to
  // its position within RadixArray.
  //
  // As an optimization, we encode from the last call stack in CallStacks to
  // reduce the number of times we follow pointers to the parents. Consider
  // the list of call stacks that has been sorted in the dictionary order:
  //
  // Call Stack 1: F1
  // Call Stack 2: F1 -> F2
  // Call Stack 3: F1 -> F2 -> F3
  //
  // If we traversed CallStacks in the forward order, we would end up with a
  // radix tree like:
  //
  // Call Stack 1: F1
  //               |
  // Call Stack 2: +---> F2
  //                     |
  // Call Stack 3:       +---> F3
  //
  // Notice that each call stack jumps to the previous one. However, if we
  // traverse CallStacks in the reverse order, then Call Stack 3 has the
  // complete call stack encoded without any pointers. Call Stacks 1 and 2
  // point to appropriate prefixes of Call Stack 3.
  const llvm::SmallVector<FrameIdTy> *Prev = nullptr;
  for (const auto &[CSId, CallStack] : llvm::reverse(CallStacks)) {
    LinearCallStackId Pos =
        encodeCallStack(&CallStack, Prev, MemProfFrameIndexes);
    CallStackPos.insert({CSId, Pos});
    Prev = &CallStack;
  }

  // "RadixArray.size() - 1" below is problematic if RadixArray is empty.
  assert(!RadixArray.empty());

  // Reverse the radix array in place. We do so mostly for intuitive
  // deserialization where we would read the length field and then the call
  // stack frames proper just like any other array deserialization, except
  // that we have occasional jumps to take advantage of prefixes.
  for (size_t I = 0, J = RadixArray.size() - 1; I < J; ++I, --J)
    std::swap(RadixArray[I], RadixArray[J]);

  // "Reverse" the indexes stored in CallStackPos.
  for (auto &[K, V] : CallStackPos)
    V = RadixArray.size() - 1 - V;
}

// Explicitly instantiate class with the utilized FrameIdTy.
template class CallStackRadixTreeBuilder<FrameId>;
template class CallStackRadixTreeBuilder<LinearFrameId>;

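// Count how many times each frame appears across all call stacks (Count) and
// accumulate the sum of its zero-based positions within them (PositionSum).
// Call stacks are stored leaf-to-root, so position 0 is the leaf frame.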
template <typename FrameIdTy>
llvm::DenseMap<FrameIdTy, FrameStat>
computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameIdTy>>
                          &MemProfCallStackData) {
  llvm::DenseMap<FrameIdTy, FrameStat> Histogram;

  for (const auto &KV : MemProfCallStackData) {
    const auto &CS = KV.second;
    for (unsigned I = 0, E = CS.size(); I != E; ++I) {
      auto &S = Histogram[CS[I]];
      ++S.Count;
      S.PositionSum += I;
    }
  }
  return Histogram;
}

// Explicitly instantiate function with the utilized FrameIdTy.
template llvm::DenseMap<FrameId, FrameStat> computeFrameHistogram<FrameId>(
    llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
        &MemProfCallStackData);
template llvm::DenseMap<LinearFrameId, FrameStat>
computeFrameHistogram<LinearFrameId>(
    llvm::MapVector<CallStackId, llvm::SmallVector<LinearFrameId>>
        &MemProfCallStackData);
} // namespace memprof
} // namespace llvm