//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved with recursive cycles
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(false), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling requires a little bit of compile
// time overhead and doesn't affect correctness; it will just inflate the cold
// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;
  void printTotalSizes(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case, where we have an Instruction*,
    // it is not as immediately discoverable. Used for printing richer
    // information when dumping the graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // The corresponding allocation or interior call. This is the primary call
    // for which we have created this node.
    CallInfo Call;

    // List of other calls that can be treated the same as the primary call
    // through cloning. I.e. located in the same function and have the same
    // (possibly pruned) stack ids. They will be updated the same way as the
    // primary call when assigning to function clones.
    SmallVector<CallInfo, 0> MatchingCalls;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // Get the list of edges from which we can compute allocation information
    // such as the context ids and allocation type of this node.
    const std::vector<std::shared_ptr<ContextEdge>> *
    getEdgesWithAllocInfo() const {
      // If node has any callees, compute from those, otherwise compute from
      // callers (i.e. if this is the leaf allocation node).
      if (!CalleeEdges.empty())
        return &CalleeEdges;
      if (!CallerEdges.empty()) {
        // A node with caller edges but no callee edges must be the allocation
        // node.
        assert(IsAllocation);
        return &CallerEdges;
      }
      return nullptr;
    }

    // Compute the context ids for this node from the union of its edge context
    // ids.
    DenseSet<uint32_t> getContextIds() const {
      DenseSet<uint32_t> ContextIds;
      auto *Edges = getEdgesWithAllocInfo();
      if (!Edges)
        return {};
      unsigned Count = 0;
      for (auto &Edge : *Edges)
        Count += Edge->getContextIds().size();
      ContextIds.reserve(Count);
      for (auto &Edge : *Edges)
        ContextIds.insert(Edge->getContextIds().begin(),
                          Edge->getContextIds().end());
      return ContextIds;
    }

    // Compute the allocation type for this node from the OR of its edge
    // allocation types.
    uint8_t computeAllocType() const {
      auto *Edges = getEdgesWithAllocInfo();
      if (!Edges)
        return (uint8_t)AllocationType::None;
      uint8_t BothTypes =
          (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
      uint8_t AllocType = (uint8_t)AllocationType::None;
      for (auto &Edge : *Edges) {
        AllocType |= Edge->AllocTypes;
        // Bail early if alloc type reached both, no further refinement.
        if (AllocType == BothTypes)
          return AllocType;
      }
      return AllocType;
    }

    // The context ids set for this node is empty if its edge context ids are
    // also all empty.
    bool emptyContextIds() const {
      auto *Edges = getEdgesWithAllocInfo();
      if (!Edges)
        return true;
      for (auto &Edge : *Edges) {
        if (!Edge->getContextIds().empty())
          return false;
      }
      return true;
    }

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // it should have an allocation type of None and empty context ids.
    bool isRemoved() const {
      assert((AllocTypes == (uint8_t)AllocationType::None) ==
             emptyContextIds());
      return AllocTypes == (uint8_t)AllocationType::None;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(std::move(ContextIds)) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    // Helper to clear the fields of this edge when we are removing it from the
    // graph.
    inline void clear() {
      ContextIds.clear();
      AllocTypes = (uint8_t)AllocationType::None;
      Caller = nullptr;
      Callee = nullptr;
    }

    // Check if edge was removed from the graph. This is useful while iterating
    // over a copy of edge lists when performing operations that mutate the
    // graph in ways that might remove one of the edges.
    inline bool isRemoved() const {
      if (Callee || Caller)
        return false;
      // Any edges that have been removed from the graph but are still in a
      // shared_ptr somewhere should have all fields null'ed out by clear()
      // above.
      assert(AllocTypes == (uint8_t)AllocationType::None);
      assert(ContextIds.empty());
      return true;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void removeNoneTypeCallerEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType,
                           ArrayRef<ContextTotalSize> ContextSizeInfo);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  // Try to partition calls on the given node (already placed into the AllCalls
  // array) by callee function, creating new copies of Node as needed to hold
  // calls with different callees, and moving the callee edges appropriately.
  // Returns true if partitioning was successful.
  bool partitionCallsByCallee(
      ContextNode *Node, ArrayRef<CallInfo> AllCalls,
      std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  // Structure to keep track of information for each call as we are matching
  // non-allocation callsites onto context nodes created from the allocation
  // call metadata / summary contexts.
  struct CallContextInfo {
    // The callsite we're trying to match.
    CallTy Call;
    // The callsite's stack ids that have a context node in the graph.
    std::vector<uint64_t> StackIds;
    // The function containing this callsite.
    const FuncTy *Func;
    // Initially empty, if needed this will be updated to contain the context
    // ids for use in a new context node created for this callsite.
    DenseSet<uint32_t> ContextIds;
  };

  /// Helper to remove edge from graph, updating edge iterator if it is provided
  /// (in which case CalleeIter indicates which edge list is being iterated).
  /// This will also perform the necessary clearing of the ContextEdge members
  /// to enable later checking if the edge has been removed (since we may have
  /// other copies of the shared_ptr in existence, and in fact rely on this to
  /// enable removal while iterating over a copy of a node's edge list).
  void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
                           bool CalleeIter = true);

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee,
                      DenseSet<uint32_t> RemainingContextIds);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
  /// the updated edges and to prepare it for an increment in the caller.
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  // Return the callee function of the given call, or nullptr if it can't be
  // determined.
  const FuncTy *getCalleeFunc(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for the callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Get the AllocationType assigned to the given allocation instruction clone.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording mapping of all
  /// of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  // Create and return a new ContextNode.
  ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
                             CallInfo C = CallInfo()) {
    NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
    auto *NewNode = NodeOwner.back().get();
    if (F)
      NodeToCallingFunc[NewNode] = F;
    return NewNode;
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  void print(raw_ostream &OS) const {
    PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
      assert(CI);
      OS << *CI;
    }
  }
};
} // namespace

namespace llvm {
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm

namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const FunctionSummary *getCalleeFunc(IndexCall &Call);
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool sameCallee(IndexCall &Call1, IndexCall &Call2);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  // This should be called only when the InAllocTypes vector was computed for
  // this set of Edges. Make sure the sizes are the same.
  assert(InAllocTypes.size() == Edges.size());
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for callee edges in the given
// clone. Because the InAllocTypes were computed from the original node's callee
// edges, and other cloning could have happened after this clone was created, we
// need to find the matching clone callee edge, which may or may not exist.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatchClone(
    const std::vector<uint8_t> &InAllocTypes,
    const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
  const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
  assert(Node);
  // InAllocTypes should have been computed for the original node's callee
  // edges.
  assert(InAllocTypes.size() == Node->CalleeEdges.size());
  // First create a map of the clone callee edge callees to the edge alloc type.
  DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
      EdgeCalleeMap;
  for (const auto &E : Clone->CalleeEdges) {
    assert(!EdgeCalleeMap.contains(E->Callee));
    EdgeCalleeMap[E->Callee] = E->AllocTypes;
  }
  // Next, walk the original node's callees, and look for the corresponding
  // clone edge to that callee.
  for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
    auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
    // Not found is ok, we will simply add an edge if we use this clone.
    if (Iter == EdgeCalleeMap.end())
      continue;
    // Can share if one of the edges is None type - don't
    // care about the type along that edge as it doesn't
    // exist for those context ids.
    if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
        Iter->second == (uint8_t)AllocationType::None)
      continue;
    if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
      return false;
  }
  return true;
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
    ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
  assert(!EI || (*EI)->get() == Edge);
  // Save the Caller and Callee pointers so we can erase Edge from their edge
  // lists after clearing Edge below. We do the clearing first in case it is
  // destructed after removing from the edge lists (if those were the last
  // shared_ptr references to Edge).
  auto *Callee = Edge->Callee;
  auto *Caller = Edge->Caller;

  // Make sure the edge fields are cleared out so we can properly detect
  // removed edges if Edge is not destructed because there is still a shared_ptr
  // reference.
  Edge->clear();

  if (!EI) {
    Callee->eraseCallerEdge(Edge);
    Caller->eraseCalleeEdge(Edge);
  } else if (CalleeIter) {
    Callee->eraseCallerEdge(Edge);
    *EI = Caller->CalleeEdges.erase(*EI);
  } else {
    Caller->eraseCalleeEdge(Edge);
    *EI = Callee->CallerEdges.erase(*EI);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
  for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Caller->eraseCalleeEdge(Edge.get());
      EI = Node->CallerEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) const {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids,
    const DenseSet<uint32_t> &Node2Ids) const {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids,
    const DenseSet<uint32_t> &Node2Ids) const {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
  AllocationCallToContextNodeMap[Call] = AllocNode;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}

static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (!AllocTypes)
    return "None";
  std::string Str;
  if (AllocTypes & (uint8_t)AllocationType::NotCold)
    Str += "NotCold";
  if (AllocTypes & (uint8_t)AllocationType::Cold)
    Str += "Cold";
  return Str;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
    ArrayRef<ContextTotalSize> ContextSizeInfo) {
  // Treat the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  if (!ContextSizeInfo.empty()) {
    auto &Entry = ContextIdToContextSizeInfos[LastContextId];
    Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
  }

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      StackNode = createNewNode(/*IsAllocation=*/false);
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    // Marking a node recursive will prevent its cloning completely, even for
    // non-recursive contexts flowing through it.
    if (!AllowRecursiveCallsites) {
      auto Ins = StackIdSet.insert(StackId);
      if (!Ins.second)
        StackNode->Recursive = true;
    }
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
  }
  return NewContextIds;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
    // This must be passed by value to make a copy since it will be adjusted
    // as ids are moved.
    DenseSet<uint32_t> RemainingContextIds) {
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    // Remove any matching context ids from Edge, return set that were found and
    // removed, these are the new edge's context ids. Also update the remaining
    // (not found ids).
    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    RemainingContextIds.swap(NotFoundContextIds);
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
      continue;
    }
    ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
1397 if (Node->CallerEdges.size()) { 1398 DenseSet<uint32_t> CallerEdgeContextIds( 1399 Node->CallerEdges.front()->ContextIds); 1400 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { 1401 if (CheckEdges) 1402 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1403 set_union(CallerEdgeContextIds, Edge->ContextIds); 1404 } 1405 // Node can have more context ids than callers if some contexts terminate at 1406 // node and some are longer. If we are allowing recursive callsites but 1407 // haven't disabled recursive contexts, this will be violated for 1408 // incompletely cloned recursive cycles, so skip the checking in that case. 1409 assert((AllowRecursiveCallsites && AllowRecursiveContexts) || 1410 NodeContextIds == CallerEdgeContextIds || 1411 set_is_subset(CallerEdgeContextIds, NodeContextIds)); 1412 } 1413 if (Node->CalleeEdges.size()) { 1414 DenseSet<uint32_t> CalleeEdgeContextIds( 1415 Node->CalleeEdges.front()->ContextIds); 1416 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { 1417 if (CheckEdges) 1418 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1419 set_union(CalleeEdgeContextIds, Edge->getContextIds()); 1420 } 1421 assert(NodeContextIds == CalleeEdgeContextIds); 1422 } 1423 // FIXME: Since this checking is only invoked under an option, we should 1424 // change the error checking from using assert to something that will trigger 1425 // an error on a release build. 1426 #ifndef NDEBUG 1427 // Make sure we don't end up with duplicate edges between the same caller and 1428 // callee. 1429 DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet; 1430 for (const auto &E : Node->CalleeEdges) 1431 NodeSet.insert(E->Callee); 1432 assert(NodeSet.size() == Node->CalleeEdges.size()); 1433 #endif 1434 } 1435 1436 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1437 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1438 assignStackNodesPostOrder( 1439 ContextNode *Node, DenseSet<const ContextNode *> &Visited, 1440 DenseMap<uint64_t, std::vector<CallContextInfo>> 1441 &StackIdToMatchingCalls, 1442 DenseMap<CallInfo, CallInfo> &CallToMatchingCall) { 1443 auto Inserted = Visited.insert(Node); 1444 if (!Inserted.second) 1445 return; 1446 // Post order traversal. Iterate over a copy since we may add nodes and 1447 // therefore new callers during the recursive call, invalidating any 1448 // iterator over the original edge vector. We don't need to process these 1449 // new nodes as they were already processed on creation. 1450 auto CallerEdges = Node->CallerEdges; 1451 for (auto &Edge : CallerEdges) { 1452 // Skip any that have been removed during the recursion. 1453 if (Edge->isRemoved()) { 1454 assert(!is_contained(Node->CallerEdges, Edge)); 1455 continue; 1456 } 1457 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls, 1458 CallToMatchingCall); 1459 } 1460 1461 // If this node's stack id is in the map, update the graph to contain new 1462 // nodes representing any inlining at interior callsites. Note we move the 1463 // associated context ids over to the new nodes. 1464 1465 // Ignore this node if it is for an allocation or we didn't record any 1466 // stack id lists ending at it. 1467 if (Node->IsAllocation || 1468 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) 1469 return; 1470 1471 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; 1472 // Handle the simple case first. A single call with a single stack id. 
1473 // In this case there is no need to create any new context nodes, simply 1474 // assign the context node for stack id to this Call. 1475 if (Calls.size() == 1) { 1476 auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; 1477 if (Ids.size() == 1) { 1478 assert(SavedContextIds.empty()); 1479 // It should be this Node 1480 assert(Node == getNodeForStackId(Ids[0])); 1481 if (Node->Recursive) 1482 return; 1483 Node->setCall(Call); 1484 NonAllocationCallToContextNodeMap[Call] = Node; 1485 NodeToCallingFunc[Node] = Func; 1486 return; 1487 } 1488 } 1489 1490 #ifndef NDEBUG 1491 // Find the node for the last stack id, which should be the same 1492 // across all calls recorded for this id, and is this node's id. 1493 uint64_t LastId = Node->OrigStackOrAllocId; 1494 ContextNode *LastNode = getNodeForStackId(LastId); 1495 // We should only have kept stack ids that had nodes. 1496 assert(LastNode); 1497 assert(LastNode == Node); 1498 #else 1499 ContextNode *LastNode = Node; 1500 #endif 1501 1502 // Compute the last node's context ids once, as it is shared by all calls in 1503 // this entry. 1504 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); 1505 1506 bool PrevIterCreatedNode = false; 1507 bool CreatedNode = false; 1508 for (unsigned I = 0; I < Calls.size(); 1509 I++, PrevIterCreatedNode = CreatedNode) { 1510 CreatedNode = false; 1511 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1512 // Skip any for which we didn't assign any ids, these don't get a node in 1513 // the graph. 1514 if (SavedContextIds.empty()) { 1515 // If this call has a matching call (located in the same function and 1516 // having the same stack ids), simply add it to the context node created 1517 // for its matching call earlier. These can be treated the same through 1518 // cloning and get updated at the same time. 1519 if (!CallToMatchingCall.contains(Call)) 1520 continue; 1521 auto MatchingCall = CallToMatchingCall[Call]; 1522 if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) { 1523 // This should only happen if we had a prior iteration, and it didn't 1524 // create a node because of the below recomputation of context ids 1525 // finding none remaining and continuing early. 1526 assert(I > 0 && !PrevIterCreatedNode); 1527 continue; 1528 } 1529 NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back( 1530 Call); 1531 continue; 1532 } 1533 1534 assert(LastId == Ids.back()); 1535 1536 // Recompute the context ids for this stack id sequence (the 1537 // intersection of the context ids of the corresponding nodes). 1538 // Start with the ids we saved in the map for this call, which could be 1539 // duplicated context ids. We have to recompute as we might have overlap 1540 // between the saved context ids for different last nodes, and may have 1541 // already removed them during the post order traversal. 1542 set_intersect(SavedContextIds, LastNodeContextIds); 1543 ContextNode *PrevNode = LastNode; 1544 bool Skip = false; 1545 // Iterate backwards through the stack Ids, starting after the last Id 1546 // in the list, which was handled once outside for all Calls. 1547 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { 1548 auto Id = *IdIter; 1549 ContextNode *CurNode = getNodeForStackId(Id); 1550 // We should only have kept stack ids that had nodes and weren't 1551 // recursive.
1552 assert(CurNode); 1553 assert(!CurNode->Recursive); 1554 1555 auto *Edge = CurNode->findEdgeFromCaller(PrevNode); 1556 if (!Edge) { 1557 Skip = true; 1558 break; 1559 } 1560 PrevNode = CurNode; 1561 1562 // Update the context ids, which is the intersection of the ids along 1563 // all edges in the sequence. 1564 set_intersect(SavedContextIds, Edge->getContextIds()); 1565 1566 // If we now have no context ids for clone, skip this call. 1567 if (SavedContextIds.empty()) { 1568 Skip = true; 1569 break; 1570 } 1571 } 1572 if (Skip) 1573 continue; 1574 1575 // Create new context node. 1576 ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call); 1577 NonAllocationCallToContextNodeMap[Call] = NewNode; 1578 CreatedNode = true; 1579 NewNode->AllocTypes = computeAllocType(SavedContextIds); 1580 1581 ContextNode *FirstNode = getNodeForStackId(Ids[0]); 1582 assert(FirstNode); 1583 1584 // Connect to callees of innermost stack frame in inlined call chain. 1585 // This updates context ids for FirstNode's callees to reflect those 1586 // moved to NewNode. 1587 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds); 1588 1589 // Connect to callers of outermost stack frame in inlined call chain. 1590 // This updates context ids for LastNode's callers to reflect those 1591 // moved to NewNode. 1592 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds); 1593 1594 // Now we need to remove context ids from edges/nodes between First and 1595 // Last Node. 1596 PrevNode = nullptr; 1597 for (auto Id : Ids) { 1598 ContextNode *CurNode = getNodeForStackId(Id); 1599 // We should only have kept stack ids that had nodes. 1600 assert(CurNode); 1601 1602 // Remove the context ids moved to NewNode from CurNode, and the 1603 // edge from the prior node. 1604 if (PrevNode) { 1605 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); 1606 assert(PrevEdge); 1607 set_subtract(PrevEdge->getContextIds(), SavedContextIds); 1608 if (PrevEdge->getContextIds().empty()) 1609 removeEdgeFromGraph(PrevEdge); 1610 } 1611 // Since we update the edges from leaf to tail, only look at the callee 1612 // edges. This isn't an alloc node, so if there are no callee edges, the 1613 // alloc type is None. 1614 CurNode->AllocTypes = CurNode->CalleeEdges.empty() 1615 ? (uint8_t)AllocationType::None 1616 : CurNode->computeAllocType(); 1617 PrevNode = CurNode; 1618 } 1619 if (VerifyNodes) { 1620 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); 1621 for (auto Id : Ids) { 1622 ContextNode *CurNode = getNodeForStackId(Id); 1623 // We should only have kept stack ids that had nodes. 1624 assert(CurNode); 1625 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); 1626 } 1627 } 1628 } 1629 } 1630 1631 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1632 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { 1633 // Map of stack id to all calls with that as the last (outermost caller) 1634 // callsite id that has a context node (some might not due to pruning 1635 // performed during matching of the allocation profile contexts). 1636 // The CallContextInfo contains the Call and a list of its stack ids with 1637 // ContextNodes, the function containing Call, and the set of context ids 1638 // the analysis will eventually identify for use in any new node created 1639 // for that callsite.
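// For illustration only (hypothetical ids): a callsite whose callsite
// metadata lists stack ids [3, 7] (innermost first), where both ids have
// context nodes, is recorded under key 7 as {Call, [3, 7], containing
// function, {}}; the empty context id set is computed later in this function.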
1640 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; 1641 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 1642 for (auto &Call : CallsWithMetadata) { 1643 // Ignore allocations, already handled. 1644 if (AllocationCallToContextNodeMap.count(Call)) 1645 continue; 1646 auto StackIdsWithContextNodes = 1647 getStackIdsWithContextNodesForCall(Call.call()); 1648 // If there were no nodes created for MIBs on allocs (maybe this was in 1649 // the unambiguous part of the MIB stack that was pruned), ignore. 1650 if (StackIdsWithContextNodes.empty()) 1651 continue; 1652 // Otherwise, record this Call along with the list of ids for the last 1653 // (outermost caller) stack id with a node. 1654 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( 1655 {Call.call(), StackIdsWithContextNodes, Func, {}}); 1656 } 1657 } 1658 1659 // First make a pass through all stack ids that correspond to a call, 1660 // as identified in the above loop. Compute the context ids corresponding to 1661 // each of these calls when they correspond to multiple stack ids due to 1662 // inlining. Perform any duplication of context ids required when 1663 // there is more than one call with the same stack ids. Their (possibly newly 1664 // duplicated) context ids are saved in the StackIdToMatchingCalls map. 1665 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; 1666 // Save a map from each call to any that are found to match it. I.e. located 1667 // in the same function and have the same (possibly pruned) stack ids. We use 1668 // this to avoid creating extra graph nodes as they can be treated the same. 1669 DenseMap<CallInfo, CallInfo> CallToMatchingCall; 1670 for (auto &It : StackIdToMatchingCalls) { 1671 auto &Calls = It.getSecond(); 1672 // Skip single calls with a single stack id. These don't need a new node. 1673 if (Calls.size() == 1) { 1674 auto &Ids = Calls[0].StackIds; 1675 if (Ids.size() == 1) 1676 continue; 1677 } 1678 // In order to do the best and maximal matching of inlined calls to context 1679 // node sequences we will sort the vectors of stack ids in descending order 1680 // of length, and within each length, lexicographically by stack id. The 1681 // latter is so that we can specially handle calls that have identical stack 1682 // id sequences (either due to cloning or artificially because of the MIB 1683 // context pruning). Those with the same Ids are then sorted by function to 1684 // facilitate efficiently mapping them to the same context node. 1685 // Because the functions are pointers, to ensure a stable sort first assign 1686 // each function pointer to its first index in the Calls array, and then use 1687 // that to sort by. 1688 DenseMap<const FuncTy *, unsigned> FuncToIndex; 1689 for (const auto &[Idx, CallCtxInfo] : enumerate(Calls)) 1690 FuncToIndex.insert({CallCtxInfo.Func, Idx}); 1691 std::stable_sort( 1692 Calls.begin(), Calls.end(), 1693 [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) { 1694 return A.StackIds.size() > B.StackIds.size() || 1695 (A.StackIds.size() == B.StackIds.size() && 1696 (A.StackIds < B.StackIds || 1697 (A.StackIds == B.StackIds && 1698 FuncToIndex[A.Func] < FuncToIndex[B.Func]))); 1699 }); 1700 1701 // Find the node for the last stack id, which should be the same 1702 // across all calls recorded for this id, and is the id for this 1703 // entry in the StackIdToMatchingCalls map.
1704 uint64_t LastId = It.getFirst(); 1705 ContextNode *LastNode = getNodeForStackId(LastId); 1706 // We should only have kept stack ids that had nodes. 1707 assert(LastNode); 1708 1709 if (LastNode->Recursive) 1710 continue; 1711 1712 // Initialize the context ids with the last node's. We will subsequently 1713 // refine the context ids by computing the intersection along all edges. 1714 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); 1715 assert(!LastNodeContextIds.empty()); 1716 1717 #ifndef NDEBUG 1718 // Save the set of functions seen for a particular set of the same stack 1719 // ids. This is used to ensure that they have been correctly sorted to be 1720 // adjacent in the Calls list, since we rely on that to efficiently place 1721 // all such matching calls onto the same context node. 1722 DenseSet<const FuncTy *> MatchingIdsFuncSet; 1723 #endif 1724 1725 for (unsigned I = 0; I < Calls.size(); I++) { 1726 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1727 assert(SavedContextIds.empty()); 1728 assert(LastId == Ids.back()); 1729 1730 #ifndef NDEBUG 1731 // If this call has a different set of ids than the last one, clear the 1732 // set used to ensure they are sorted properly. 1733 if (I > 0 && Ids != Calls[I - 1].StackIds) 1734 MatchingIdsFuncSet.clear(); 1735 #endif 1736 1737 // First compute the context ids for this stack id sequence (the 1738 // intersection of the context ids of the corresponding nodes). 1739 // Start with the remaining saved ids for the last node. 1740 assert(!LastNodeContextIds.empty()); 1741 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; 1742 1743 ContextNode *PrevNode = LastNode; 1744 ContextNode *CurNode = LastNode; 1745 bool Skip = false; 1746 1747 // Iterate backwards through the stack Ids, starting after the last Id 1748 // in the list, which was handled once outside for all Calls. 1749 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { 1750 auto Id = *IdIter; 1751 CurNode = getNodeForStackId(Id); 1752 // We should only have kept stack ids that had nodes. 1753 assert(CurNode); 1754 1755 if (CurNode->Recursive) { 1756 Skip = true; 1757 break; 1758 } 1759 1760 auto *Edge = CurNode->findEdgeFromCaller(PrevNode); 1761 // If there is no edge then the nodes belong to different MIB contexts, 1762 // and we should skip this inlined context sequence. For example, this 1763 // particular inlined context may include stack ids A->B, and we may 1764 // indeed have nodes for both A and B, but it is possible that they were 1765 // never profiled in sequence in a single MIB for any allocation (i.e. 1766 // we might have profiled an allocation that involves the callsite A, 1767 // but through a different one of its callee callsites, and we might 1768 // have profiled an allocation that involves callsite B, but reached 1769 // from a different caller callsite). 1770 if (!Edge) { 1771 Skip = true; 1772 break; 1773 } 1774 PrevNode = CurNode; 1775 1776 // Update the context ids, which is the intersection of the ids along 1777 // all edges in the sequence. 1778 set_intersect(StackSequenceContextIds, Edge->getContextIds()); 1779 1780 // If we now have no context ids for clone, skip this call. 1781 if (StackSequenceContextIds.empty()) { 1782 Skip = true; 1783 break; 1784 } 1785 } 1786 if (Skip) 1787 continue; 1788 1789 // If some of this call's stack ids did not have corresponding nodes (due 1790 // to pruning), don't include any context ids for contexts that extend 1791 // beyond these nodes. 
Otherwise we would be matching part of unrelated / 1792 // not fully matching stack contexts. To do this, subtract any context ids 1793 // found in caller nodes of the last node found above. 1794 if (Ids.back() != getLastStackId(Call)) { 1795 for (const auto &PE : LastNode->CallerEdges) { 1796 set_subtract(StackSequenceContextIds, PE->getContextIds()); 1797 if (StackSequenceContextIds.empty()) 1798 break; 1799 } 1800 // If we now have no context ids for clone, skip this call. 1801 if (StackSequenceContextIds.empty()) 1802 continue; 1803 } 1804 1805 #ifndef NDEBUG 1806 // If the prior call had the same stack ids this set would not be empty. 1807 // Check if we already have a call that "matches" because it is located 1808 // in the same function. If the Calls list was sorted properly we should 1809 // not encounter this situation as all such entries should be adjacent 1810 // and processed in bulk further below. 1811 assert(!MatchingIdsFuncSet.contains(Func)); 1812 1813 MatchingIdsFuncSet.insert(Func); 1814 #endif 1815 1816 // Check if the next set of stack ids is the same (since the Calls vector 1817 // of tuples is sorted by the stack ids we can just look at the next one). 1818 // If so, save them in the CallToMatchingCall map so that they get 1819 // assigned to the same context node, and skip them. 1820 bool DuplicateContextIds = false; 1821 for (unsigned J = I + 1; J < Calls.size(); J++) { 1822 auto &CallCtxInfo = Calls[J]; 1823 auto &NextIds = CallCtxInfo.StackIds; 1824 if (NextIds != Ids) 1825 break; 1826 auto *NextFunc = CallCtxInfo.Func; 1827 if (NextFunc != Func) { 1828 // We have another Call with the same ids but that cannot share this 1829 // node, must duplicate ids for it. 1830 DuplicateContextIds = true; 1831 break; 1832 } 1833 auto &NextCall = CallCtxInfo.Call; 1834 CallToMatchingCall[NextCall] = Call; 1835 // Update I so that it gets incremented correctly to skip this call. 1836 I = J; 1837 } 1838 1839 // If we don't have duplicate context ids, then we can assign all the 1840 // context ids computed for the original node sequence to this call. 1841 // If there are duplicate calls with the same stack ids then we synthesize 1842 // new context ids that are duplicates of the originals. These are 1843 // assigned to SavedContextIds, which is a reference into the map entry 1844 // for this call, allowing us to access these ids later on. 1845 OldToNewContextIds.reserve(OldToNewContextIds.size() + 1846 StackSequenceContextIds.size()); 1847 SavedContextIds = 1848 DuplicateContextIds 1849 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) 1850 : StackSequenceContextIds; 1851 assert(!SavedContextIds.empty()); 1852 1853 if (!DuplicateContextIds) { 1854 // Update saved last node's context ids to remove those that are 1855 // assigned to other calls, so that it is ready for the next call at 1856 // this stack id. 1857 set_subtract(LastNodeContextIds, StackSequenceContextIds); 1858 if (LastNodeContextIds.empty()) 1859 break; 1860 } 1861 } 1862 } 1863 1864 // Propagate the duplicate context ids over the graph. 1865 propagateDuplicateContextIds(OldToNewContextIds); 1866 1867 if (VerifyCCG) 1868 check(); 1869 1870 // Now perform a post-order traversal over the graph, starting with the 1871 // allocation nodes, essentially processing nodes from callers to callees. 1872 // For any that contains an id in the map, update the graph to contain new 1873 // nodes representing any inlining at interior callsites. Note we move the 1874 // associated context ids over to the new nodes. 
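// As a hypothetical example of the traversal order: for an allocation node A
// whose caller is B, and B's caller C, the recursion below walks A -> B -> C
// but performs the stack node updates on C first, then B (A itself is an
// allocation node and is skipped), i.e. callers before their callees.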
1875 DenseSet<const ContextNode *> Visited; 1876 for (auto &Entry : AllocationCallToContextNodeMap) 1877 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls, 1878 CallToMatchingCall); 1879 if (VerifyCCG) 1880 check(); 1881 } 1882 1883 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { 1884 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1885 Call->getMetadata(LLVMContext::MD_callsite)); 1886 return CallsiteContext.back(); 1887 } 1888 1889 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { 1890 assert(isa<CallsiteInfo *>(Call)); 1891 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1892 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call)); 1893 // Need to convert index into stack id. 1894 return Index.getStackIdAtIndex(CallsiteContext.back()); 1895 } 1896 1897 static const std::string MemProfCloneSuffix = ".memprof."; 1898 1899 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { 1900 // We use CloneNo == 0 to refer to the original version, which doesn't get 1901 // renamed with a suffix. 1902 if (!CloneNo) 1903 return Base.str(); 1904 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); 1905 } 1906 1907 static bool isMemProfClone(const Function &F) { 1908 return F.getName().contains(MemProfCloneSuffix); 1909 } 1910 1911 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, 1912 const Instruction *Call, 1913 unsigned CloneNo) const { 1914 return (Twine(Call->getFunction()->getName()) + " -> " + 1915 cast<CallBase>(Call)->getCalledFunction()->getName()) 1916 .str(); 1917 } 1918 1919 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, 1920 const IndexCall &Call, 1921 unsigned CloneNo) const { 1922 auto VI = FSToVIMap.find(Func); 1923 assert(VI != FSToVIMap.end()); 1924 if (isa<AllocInfo *>(Call)) 1925 return (VI->second.name() + " -> alloc").str(); 1926 else { 1927 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call); 1928 return (VI->second.name() + " -> " + 1929 getMemProfFuncName(Callsite->Callee.name(), 1930 Callsite->Clones[CloneNo])) 1931 .str(); 1932 } 1933 } 1934 1935 std::vector<uint64_t> 1936 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( 1937 Instruction *Call) { 1938 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1939 Call->getMetadata(LLVMContext::MD_callsite)); 1940 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( 1941 CallsiteContext); 1942 } 1943 1944 std::vector<uint64_t> 1945 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { 1946 assert(isa<CallsiteInfo *>(Call)); 1947 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1948 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call)); 1949 return getStackIdsWithContextNodes<CallsiteInfo, 1950 SmallVector<unsigned>::const_iterator>( 1951 CallsiteContext); 1952 } 1953 1954 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1955 template <class NodeT, class IteratorT> 1956 std::vector<uint64_t> 1957 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( 1958 CallStack<NodeT, IteratorT> &CallsiteContext) { 1959 std::vector<uint64_t> StackIds; 1960 for (auto IdOrIndex : CallsiteContext) { 1961 auto StackId = getStackId(IdOrIndex); 1962 ContextNode *Node = getNodeForStackId(StackId); 1963 if (!Node) 1964 break; 1965 StackIds.push_back(StackId); 1966 } 1967 return StackIds; 1968 } 1969 1970 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( 1971 
Module &M, 1972 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) 1973 : Mod(M), OREGetter(OREGetter) { 1974 for (auto &F : M) { 1975 std::vector<CallInfo> CallsWithMetadata; 1976 for (auto &BB : F) { 1977 for (auto &I : BB) { 1978 if (!isa<CallBase>(I)) 1979 continue; 1980 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { 1981 CallsWithMetadata.push_back(&I); 1982 auto *AllocNode = addAllocNode(&I, &F); 1983 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); 1984 assert(CallsiteMD); 1985 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); 1986 // Add all of the MIBs and their stack nodes. 1987 for (auto &MDOp : MemProfMD->operands()) { 1988 auto *MIBMD = cast<const MDNode>(MDOp); 1989 std::vector<ContextTotalSize> ContextSizeInfo; 1990 // Collect the context size information if it exists. 1991 if (MIBMD->getNumOperands() > 2) { 1992 for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) { 1993 MDNode *ContextSizePair = 1994 dyn_cast<MDNode>(MIBMD->getOperand(I)); 1995 assert(ContextSizePair->getNumOperands() == 2); 1996 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>( 1997 ContextSizePair->getOperand(0)) 1998 ->getZExtValue(); 1999 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>( 2000 ContextSizePair->getOperand(1)) 2001 ->getZExtValue(); 2002 ContextSizeInfo.push_back({FullStackId, TotalSize}); 2003 } 2004 } 2005 MDNode *StackNode = getMIBStackNode(MIBMD); 2006 assert(StackNode); 2007 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); 2008 addStackNodesForMIB<MDNode, MDNode::op_iterator>( 2009 AllocNode, StackContext, CallsiteContext, 2010 getMIBAllocType(MIBMD), ContextSizeInfo); 2011 } 2012 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 2013 // Memprof and callsite metadata on memory allocations no longer 2014 // needed. 2015 I.setMetadata(LLVMContext::MD_memprof, nullptr); 2016 I.setMetadata(LLVMContext::MD_callsite, nullptr); 2017 } 2018 // For callsite metadata, add to list for this function for later use. 2019 else if (I.getMetadata(LLVMContext::MD_callsite)) { 2020 CallsWithMetadata.push_back(&I); 2021 } 2022 } 2023 } 2024 if (!CallsWithMetadata.empty()) 2025 FuncToCallsWithMetadata[&F] = CallsWithMetadata; 2026 } 2027 2028 if (DumpCCG) { 2029 dbgs() << "CCG before updating call stack chains:\n"; 2030 dbgs() << *this; 2031 } 2032 2033 if (ExportToDot) 2034 exportToDot("prestackupdate"); 2035 2036 updateStackNodes(); 2037 2038 handleCallsitesWithMultipleTargets(); 2039 2040 // Strip off remaining callsite metadata, no longer needed. 2041 for (auto &FuncEntry : FuncToCallsWithMetadata) 2042 for (auto &Call : FuncEntry.second) 2043 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); 2044 } 2045 2046 IndexCallsiteContextGraph::IndexCallsiteContextGraph( 2047 ModuleSummaryIndex &Index, 2048 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 2049 isPrevailing) 2050 : Index(Index), isPrevailing(isPrevailing) { 2051 for (auto &I : Index) { 2052 auto VI = Index.getValueInfo(I); 2053 for (auto &S : VI.getSummaryList()) { 2054 // We should only add the prevailing nodes. Otherwise we may try to clone 2055 // in a weak copy that won't be linked (and may be different than the 2056 // prevailing version). 2057 // We only keep the memprof summary on the prevailing copy now when 2058 // building the combined index, as a space optimization, however don't 2059 // rely on this optimization. 
The linker doesn't resolve local linkage 2060 // values so don't check whether those are prevailing. 2061 if (!GlobalValue::isLocalLinkage(S->linkage()) && 2062 !isPrevailing(VI.getGUID(), S.get())) 2063 continue; 2064 auto *FS = dyn_cast<FunctionSummary>(S.get()); 2065 if (!FS) 2066 continue; 2067 std::vector<CallInfo> CallsWithMetadata; 2068 if (!FS->allocs().empty()) { 2069 for (auto &AN : FS->mutableAllocs()) { 2070 // This can happen because of recursion elimination handling that 2071 // currently exists in ModuleSummaryAnalysis. Skip these for now. 2072 // We still added them to the summary because we need to be able to 2073 // correlate properly in applyImport in the backends. 2074 if (AN.MIBs.empty()) 2075 continue; 2076 IndexCall AllocCall(&AN); 2077 CallsWithMetadata.push_back(AllocCall); 2078 auto *AllocNode = addAllocNode(AllocCall, FS); 2079 // Pass an empty CallStack to the CallsiteContext (second) 2080 // parameter, since for ThinLTO we already collapsed out the inlined 2081 // stack ids on the allocation call during ModuleSummaryAnalysis. 2082 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 2083 EmptyContext; 2084 unsigned I = 0; 2085 assert( 2086 (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) || 2087 AN.ContextSizeInfos.size() == AN.MIBs.size()); 2088 // Now add all of the MIBs and their stack nodes. 2089 for (auto &MIB : AN.MIBs) { 2090 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 2091 StackContext(&MIB); 2092 std::vector<ContextTotalSize> ContextSizeInfo; 2093 if (!AN.ContextSizeInfos.empty()) { 2094 for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I]) 2095 ContextSizeInfo.push_back({FullStackId, TotalSize}); 2096 } 2097 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( 2098 AllocNode, StackContext, EmptyContext, MIB.AllocType, 2099 ContextSizeInfo); 2100 I++; 2101 } 2102 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 2103 // Initialize version 0 on the summary alloc node to the current alloc 2104 // type, unless it has both types in which case make it default, so 2105 // that in the case where we aren't able to clone the original version 2106 // always ends up with the default allocation behavior. 2107 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); 2108 } 2109 } 2110 // For callsite metadata, add to list for this function for later use. 2111 if (!FS->callsites().empty()) 2112 for (auto &SN : FS->mutableCallsites()) { 2113 IndexCall StackNodeCall(&SN); 2114 CallsWithMetadata.push_back(StackNodeCall); 2115 } 2116 2117 if (!CallsWithMetadata.empty()) 2118 FuncToCallsWithMetadata[FS] = CallsWithMetadata; 2119 2120 if (!FS->allocs().empty() || !FS->callsites().empty()) 2121 FSToVIMap[FS] = VI; 2122 } 2123 } 2124 2125 if (DumpCCG) { 2126 dbgs() << "CCG before updating call stack chains:\n"; 2127 dbgs() << *this; 2128 } 2129 2130 if (ExportToDot) 2131 exportToDot("prestackupdate"); 2132 2133 updateStackNodes(); 2134 2135 handleCallsitesWithMultipleTargets(); 2136 } 2137 2138 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2139 void CallsiteContextGraph<DerivedCCG, FuncTy, 2140 CallTy>::handleCallsitesWithMultipleTargets() { 2141 // Look for and workaround callsites that call multiple functions. 2142 // This can happen for indirect calls, which needs better handling, and in 2143 // more rare cases (e.g. macro expansion). 
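// For example (hypothetical code): a virtual call b->m() in foo(Base *b) may
// have profiled targets in both Derived1::m and Derived2::m, leaving a single
// context node here with callee edges into more than one function.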
2144 // TODO: To fix this for indirect calls we will want to perform speculative 2145 // devirtualization using either the normal PGO info with ICP, or using the 2146 // information in the profiled MemProf contexts. We can do this prior to 2147 // this transformation for regular LTO, and for ThinLTO we can simulate that 2148 // effect in the summary and perform the actual speculative devirtualization 2149 // while cloning in the ThinLTO backend. 2150 2151 // Keep track of the new nodes synthesized for discovered tail calls missing 2152 // from the profiled contexts. 2153 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; 2154 2155 std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode; 2156 for (auto &Entry : NonAllocationCallToContextNodeMap) { 2157 auto *Node = Entry.second; 2158 assert(Node->Clones.empty()); 2159 // Check all node callees and see if in the same function. 2160 // We need to check all of the calls recorded in this Node, because in some 2161 // cases we may have had multiple calls with the same debug info calling 2162 // different callees. This can happen, for example, when an object is 2163 // constructed in the parameter list - the destructor call of the object has 2164 // the same debug info (line/col) as the call the object was passed to. 2165 // Here we will prune any that don't match all callee nodes. 2166 std::vector<CallInfo> AllCalls; 2167 AllCalls.reserve(Node->MatchingCalls.size() + 1); 2168 AllCalls.push_back(Node->Call); 2169 AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(), 2170 Node->MatchingCalls.end()); 2171 2172 // First see if we can partition the calls by callee function, creating new 2173 // nodes to host each set of calls calling the same callees. This is 2174 // necessary to support indirect calls with ThinLTO, for which we 2175 // synthesized CallsiteInfo records for each target. They will all have the 2176 // same callsite stack ids and would be sharing a context node at this 2177 // point. We need to perform separate cloning for each, which will be 2178 // applied along with speculative devirtualization in the ThinLTO backends 2179 // as needed. Note this does not currently support looking through tail 2180 // calls; it is unclear if we need that for indirect call targets. 2181 // First partition calls by callee func. Map indexed by func, value is 2182 // struct with list of matching calls, assigned node. 2183 if (partitionCallsByCallee(Node, AllCalls, NewCallToNode)) 2184 continue; 2185 2186 auto It = AllCalls.begin(); 2187 // Iterate through the calls until we find the first that matches. 2188 for (; It != AllCalls.end(); ++It) { 2189 auto ThisCall = *It; 2190 bool Match = true; 2191 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); 2192 ++EI) { 2193 auto Edge = *EI; 2194 if (!Edge->Callee->hasCall()) 2195 continue; 2196 assert(NodeToCallingFunc.count(Edge->Callee)); 2197 // Check if the called function matches that of the callee node. 2198 if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) { 2199 Match = false; 2200 break; 2201 } 2202 } 2203 // Found a call that matches the callee nodes, so we can quit now. 2204 if (Match) { 2205 // If the first match is not the primary call on the Node, update it 2206 // now. We will update the list of matching calls further below.
2207 if (Node->Call != ThisCall) { 2208 Node->setCall(ThisCall); 2209 // We need to update the NonAllocationCallToContextNodeMap, but don't 2210 // want to do this during iteration over that map, so save the calls 2211 // that need updated entries. 2212 NewCallToNode.push_back({ThisCall, Node}); 2213 } 2214 break; 2215 } 2216 } 2217 // We will update this list below (or leave it cleared if there was no 2218 // match found above). 2219 Node->MatchingCalls.clear(); 2220 // If we hit the end of the AllCalls vector, no call matching the callee 2221 // nodes was found, clear the call information in the node. 2222 if (It == AllCalls.end()) { 2223 RemovedEdgesWithMismatchedCallees++; 2224 // Work around by setting Node to have a null call, so it gets 2225 // skipped during cloning. Otherwise assignFunctions will assert 2226 // because its data structures are not designed to handle this case. 2227 Node->setCall(CallInfo()); 2228 continue; 2229 } 2230 // Now add back any matching calls that call the same function as the 2231 // matching primary call on Node. 2232 for (++It; It != AllCalls.end(); ++It) { 2233 auto ThisCall = *It; 2234 if (!sameCallee(Node->Call.call(), ThisCall.call())) 2235 continue; 2236 Node->MatchingCalls.push_back(ThisCall); 2237 } 2238 } 2239 2240 // Remove all mismatched nodes identified in the above loop from the node map 2241 // (checking whether they have a null call which is set above). For a 2242 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient 2243 // to do the removal via remove_if than by individually erasing entries above. 2244 // Also remove any entries if we updated the node's primary call above. 2245 NonAllocationCallToContextNodeMap.remove_if([](const auto &it) { 2246 return !it.second->hasCall() || it.second->Call != it.first; 2247 }); 2248 2249 // Add entries for any new primary calls recorded above. 2250 for (auto &[Call, Node] : NewCallToNode) 2251 NonAllocationCallToContextNodeMap[Call] = Node; 2252 2253 // Add the new nodes after the above loop so that the iteration is not 2254 // invalidated. 2255 for (auto &[Call, Node] : TailCallToContextNodeMap) 2256 NonAllocationCallToContextNodeMap[Call] = Node; 2257 } 2258 2259 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2260 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee( 2261 ContextNode *Node, ArrayRef<CallInfo> AllCalls, 2262 std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) { 2263 // Struct to keep track of all the calls having the same callee function, 2264 // and the node we eventually assign to them. Eventually we will record the 2265 // context node assigned to this group of calls. 2266 struct CallsWithSameCallee { 2267 std::vector<CallInfo> Calls; 2268 ContextNode *Node = nullptr; 2269 }; 2270 2271 // First partition calls by callee function. Build map from each function 2272 // to the list of matching calls. 2273 DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo; 2274 for (auto ThisCall : AllCalls) { 2275 auto *F = getCalleeFunc(ThisCall.call()); 2276 if (F) 2277 CalleeFuncToCallInfo[F].Calls.push_back(ThisCall); 2278 } 2279 2280 // Next, walk through all callee edges. For each callee node, get its 2281 // containing function and see if it was recorded in the above map (meaning we 2282 // have at least one matching call). Build another map from each callee node 2283 // with a matching call to the structure instance created above containing all 2284 // the calls. 
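// Illustrative sketch (hypothetical calls/functions): if AllCalls is
// {c1, c2, c3} where c1 and c2 call F and c3 calls G, the map above holds
// F -> {c1, c2} and G -> {c3}; the map built below then associates each
// callee node whose containing function is F or G with the corresponding
// group of calls.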
2285 DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo; 2286 for (const auto &Edge : Node->CalleeEdges) { 2287 if (!Edge->Callee->hasCall()) 2288 continue; 2289 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; 2290 if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc)) 2291 CalleeNodeToCallInfo[Edge->Callee] = 2292 &CalleeFuncToCallInfo[ProfiledCalleeFunc]; 2293 } 2294 2295 // If there are no entries in the second map, then there were no matching 2296 // calls/callees, and there is nothing to do here. Return so we can go to the 2297 // handling that looks through tail calls. 2298 if (CalleeNodeToCallInfo.empty()) 2299 return false; 2300 2301 // Walk through all callee edges again. Any and all callee edges that didn't 2302 // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a 2303 // new caller node (UnmatchedCalleesNode) which gets a null call so that it is 2304 // ignored during cloning. If it is in the map, then we use the node recorded 2305 // in that entry (creating it if needed), and move the callee edge to it. 2306 // The first callee will use the original node instead of creating a new one. 2307 // Note that any of the original calls on this node (in AllCalls) that didn't 2308 // have a callee function automatically get dropped from the node as part of 2309 // this process. 2310 ContextNode *UnmatchedCalleesNode = nullptr; 2311 // Track whether we already assigned original node to a callee. 2312 bool UsedOrigNode = false; 2313 assert(NodeToCallingFunc[Node]); 2314 // Iterate over a copy of Node's callee edges, since we may need to remove 2315 // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and 2316 // makes it less error-prone. 2317 auto CalleeEdges = Node->CalleeEdges; 2318 for (auto &Edge : CalleeEdges) { 2319 if (!Edge->Callee->hasCall()) 2320 continue; 2321 2322 // Will be updated below to point to whatever (caller) node this callee edge 2323 // should be moved to. 2324 ContextNode *CallerNodeToUse = nullptr; 2325 2326 // Handle the case where there were no matching calls first. Move this 2327 // callee edge to the UnmatchedCalleesNode, creating it if needed. 2328 if (!CalleeNodeToCallInfo.contains(Edge->Callee)) { 2329 if (!UnmatchedCalleesNode) 2330 UnmatchedCalleesNode = 2331 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]); 2332 CallerNodeToUse = UnmatchedCalleesNode; 2333 } else { 2334 // Look up the information recorded for this callee node, and use the 2335 // recorded caller node (creating it if needed). 2336 auto *Info = CalleeNodeToCallInfo[Edge->Callee]; 2337 if (!Info->Node) { 2338 // If we haven't assigned any callees to the original node use it. 2339 if (!UsedOrigNode) { 2340 Info->Node = Node; 2341 // Clear the set of matching calls which will be updated below. 2342 Node->MatchingCalls.clear(); 2343 UsedOrigNode = true; 2344 } else 2345 Info->Node = 2346 createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]); 2347 assert(!Info->Calls.empty()); 2348 // The first call becomes the primary call for this caller node, and the 2349 // rest go in the matching calls list. 2350 Info->Node->setCall(Info->Calls.front()); 2351 Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(), 2352 Info->Calls.begin() + 1, 2353 Info->Calls.end()); 2354 // Save the primary call to node correspondence so that we can update 2355 // the NonAllocationCallToContextNodeMap, which is being iterated in the 2356 // caller of this function.
2357 NewCallToNode.push_back({Info->Node->Call, Info->Node}); 2358 } 2359 CallerNodeToUse = Info->Node; 2360 } 2361 2362 // Don't need to move edge if we are using the original node; 2363 if (CallerNodeToUse == Node) 2364 continue; 2365 2366 moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse); 2367 } 2368 // Now that we are done moving edges, clean up any caller edges that ended 2369 // up with no type or context ids. During moveCalleeEdgeToNewCaller all 2370 // caller edges from Node are replicated onto the new callers, and it 2371 // simplifies the handling to leave them until we have moved all 2372 // edges/context ids. 2373 for (auto &I : CalleeNodeToCallInfo) 2374 removeNoneTypeCallerEdges(I.second->Node); 2375 if (UnmatchedCalleesNode) 2376 removeNoneTypeCallerEdges(UnmatchedCalleesNode); 2377 removeNoneTypeCallerEdges(Node); 2378 2379 return true; 2380 } 2381 2382 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 2383 // In the Module (IR) case this is already the Id. 2384 return IdOrIndex; 2385 } 2386 2387 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 2388 // In the Index case this is an index into the stack id list in the summary 2389 // index, convert it to an Id. 2390 return Index.getStackIdAtIndex(IdOrIndex); 2391 } 2392 2393 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2394 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( 2395 CallTy Call, EdgeIter &EI, 2396 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { 2397 auto Edge = *EI; 2398 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; 2399 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; 2400 // Will be populated in order of callee to caller if we find a chain of tail 2401 // calls between the profiled caller and callee. 2402 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; 2403 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc, 2404 FoundCalleeChain)) 2405 return false; 2406 2407 // The usual case where the profiled callee matches that of the IR/summary. 2408 if (FoundCalleeChain.empty()) 2409 return true; 2410 2411 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { 2412 auto *CurEdge = Callee->findEdgeFromCaller(Caller); 2413 // If there is already an edge between these nodes, simply update it and 2414 // return. 2415 if (CurEdge) { 2416 CurEdge->ContextIds.insert(Edge->ContextIds.begin(), 2417 Edge->ContextIds.end()); 2418 CurEdge->AllocTypes |= Edge->AllocTypes; 2419 return; 2420 } 2421 // Otherwise, create a new edge and insert it into the caller and callee 2422 // lists. 2423 auto NewEdge = std::make_shared<ContextEdge>( 2424 Callee, Caller, Edge->AllocTypes, Edge->ContextIds); 2425 Callee->CallerEdges.push_back(NewEdge); 2426 if (Caller == Edge->Caller) { 2427 // If we are inserting the new edge into the current edge's caller, insert 2428 // the new edge before the current iterator position, and then increment 2429 // back to the current edge. 2430 EI = Caller->CalleeEdges.insert(EI, NewEdge); 2431 ++EI; 2432 assert(*EI == Edge && 2433 "Iterator position not restored after insert and increment"); 2434 } else 2435 Caller->CalleeEdges.push_back(NewEdge); 2436 }; 2437 2438 // Create new nodes for each found callee and connect in between the profiled 2439 // caller and callee. 
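// Hypothetical example: the profile recorded Caller -> Callee, but in the IR
// Caller calls T1, which tail calls T2, which tail calls Callee. Then
// FoundCalleeChain holds the T2 callsite followed by the T1 callsite (callee
// to caller order), and the loop below synthesizes nodes so the graph gains
// Caller -> node(T1 callsite) -> node(T2 callsite) -> Callee before the
// original mismatched edge is removed.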
2440 auto *CurCalleeNode = Edge->Callee; 2441 for (auto &[NewCall, Func] : FoundCalleeChain) { 2442 ContextNode *NewNode = nullptr; 2443 // First check if we have already synthesized a node for this tail call. 2444 if (TailCallToContextNodeMap.count(NewCall)) { 2445 NewNode = TailCallToContextNodeMap[NewCall]; 2446 NewNode->AllocTypes |= Edge->AllocTypes; 2447 } else { 2448 FuncToCallsWithMetadata[Func].push_back({NewCall}); 2449 // Create Node and record node info. 2450 NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall); 2451 TailCallToContextNodeMap[NewCall] = NewNode; 2452 NewNode->AllocTypes = Edge->AllocTypes; 2453 } 2454 2455 // Hook up node to its callee node 2456 AddEdge(NewNode, CurCalleeNode); 2457 2458 CurCalleeNode = NewNode; 2459 } 2460 2461 // Hook up edge's original caller to new callee node. 2462 AddEdge(Edge->Caller, CurCalleeNode); 2463 2464 #ifndef NDEBUG 2465 // Save this because Edge's fields get cleared below when removed. 2466 auto *Caller = Edge->Caller; 2467 #endif 2468 2469 // Remove old edge 2470 removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true); 2471 2472 // To simplify the increment of EI in the caller, subtract one from EI. 2473 // In the final AddEdge call we would have either added a new callee edge, 2474 // to Edge->Caller, or found an existing one. Either way we are guaranteed 2475 // that there is at least one callee edge. 2476 assert(!Caller->CalleeEdges.empty()); 2477 --EI; 2478 2479 return true; 2480 } 2481 2482 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 2483 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, 2484 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, 2485 bool &FoundMultipleCalleeChains) { 2486 // Stop recursive search if we have already explored the maximum specified 2487 // depth. 2488 if (Depth > TailCallSearchDepth) 2489 return false; 2490 2491 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { 2492 FoundCalleeChain.push_back({Callsite, F}); 2493 }; 2494 2495 auto *CalleeFunc = dyn_cast<Function>(CurCallee); 2496 if (!CalleeFunc) { 2497 auto *Alias = dyn_cast<GlobalAlias>(CurCallee); 2498 assert(Alias); 2499 CalleeFunc = dyn_cast<Function>(Alias->getAliasee()); 2500 assert(CalleeFunc); 2501 } 2502 2503 // Look for tail calls in this function, and check if they either call the 2504 // profiled callee directly, or indirectly (via a recursive search). 2505 // Only succeed if there is a single unique tail call chain found between the 2506 // profiled caller and callee, otherwise we could perform incorrect cloning. 2507 bool FoundSingleCalleeChain = false; 2508 for (auto &BB : *CalleeFunc) { 2509 for (auto &I : BB) { 2510 auto *CB = dyn_cast<CallBase>(&I); 2511 if (!CB || !CB->isTailCall()) 2512 continue; 2513 auto *CalledValue = CB->getCalledOperand(); 2514 auto *CalledFunction = CB->getCalledFunction(); 2515 if (CalledValue && !CalledFunction) { 2516 CalledValue = CalledValue->stripPointerCasts(); 2517 // Stripping pointer casts can reveal a called function. 2518 CalledFunction = dyn_cast<Function>(CalledValue); 2519 } 2520 // Check if this is an alias to a function. If so, get the 2521 // called aliasee for the checks below. 
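// (E.g., hypothetically, a tail call may go through a GlobalAlias of the
// profiled callee; resolving to the aliasee object below lets the comparison
// against ProfiledCallee succeed.)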
2522 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 2523 assert(!CalledFunction && 2524 "Expected null called function in callsite for alias"); 2525 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 2526 } 2527 if (!CalledFunction) 2528 continue; 2529 if (CalledFunction == ProfiledCallee) { 2530 if (FoundSingleCalleeChain) { 2531 FoundMultipleCalleeChains = true; 2532 return false; 2533 } 2534 FoundSingleCalleeChain = true; 2535 FoundProfiledCalleeCount++; 2536 FoundProfiledCalleeDepth += Depth; 2537 if (Depth > FoundProfiledCalleeMaxDepth) 2538 FoundProfiledCalleeMaxDepth = Depth; 2539 SaveCallsiteInfo(&I, CalleeFunc); 2540 } else if (findProfiledCalleeThroughTailCalls( 2541 ProfiledCallee, CalledFunction, Depth + 1, 2542 FoundCalleeChain, FoundMultipleCalleeChains)) { 2543 // findProfiledCalleeThroughTailCalls should not have returned 2544 // true if FoundMultipleCalleeChains. 2545 assert(!FoundMultipleCalleeChains); 2546 if (FoundSingleCalleeChain) { 2547 FoundMultipleCalleeChains = true; 2548 return false; 2549 } 2550 FoundSingleCalleeChain = true; 2551 SaveCallsiteInfo(&I, CalleeFunc); 2552 } else if (FoundMultipleCalleeChains) 2553 return false; 2554 } 2555 } 2556 2557 return FoundSingleCalleeChain; 2558 } 2559 2560 const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) { 2561 auto *CB = dyn_cast<CallBase>(Call); 2562 if (!CB->getCalledOperand() || CB->isIndirectCall()) 2563 return nullptr; 2564 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 2565 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 2566 if (Alias) 2567 return dyn_cast<Function>(Alias->getAliasee()); 2568 return dyn_cast<Function>(CalleeVal); 2569 } 2570 2571 bool ModuleCallsiteContextGraph::calleeMatchesFunc( 2572 Instruction *Call, const Function *Func, const Function *CallerFunc, 2573 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { 2574 auto *CB = dyn_cast<CallBase>(Call); 2575 if (!CB->getCalledOperand() || CB->isIndirectCall()) 2576 return false; 2577 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 2578 auto *CalleeFunc = dyn_cast<Function>(CalleeVal); 2579 if (CalleeFunc == Func) 2580 return true; 2581 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 2582 if (Alias && Alias->getAliasee() == Func) 2583 return true; 2584 2585 // Recursively search for the profiled callee through tail calls starting with 2586 // the actual Callee. The discovered tail call chain is saved in 2587 // FoundCalleeChain, and we will fixup the graph to include these callsites 2588 // after returning. 2589 // FIXME: We will currently redo the same recursive walk if we find the same 2590 // mismatched callee from another callsite. We can improve this with more 2591 // bookkeeping of the created chain of new nodes for each mismatch. 2592 unsigned Depth = 1; 2593 bool FoundMultipleCalleeChains = false; 2594 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth, 2595 FoundCalleeChain, 2596 FoundMultipleCalleeChains)) { 2597 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " 2598 << Func->getName() << " from " << CallerFunc->getName() 2599 << " that actually called " << CalleeVal->getName() 2600 << (FoundMultipleCalleeChains 2601 ? 
" (found multiple possible chains)" 2602 : "") 2603 << "\n"); 2604 if (FoundMultipleCalleeChains) 2605 FoundProfiledCalleeNonUniquelyCount++; 2606 return false; 2607 } 2608 2609 return true; 2610 } 2611 2612 bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1, 2613 Instruction *Call2) { 2614 auto *CB1 = cast<CallBase>(Call1); 2615 if (!CB1->getCalledOperand() || CB1->isIndirectCall()) 2616 return false; 2617 auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts(); 2618 auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1); 2619 auto *CB2 = cast<CallBase>(Call2); 2620 if (!CB2->getCalledOperand() || CB2->isIndirectCall()) 2621 return false; 2622 auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts(); 2623 auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2); 2624 return CalleeFunc1 == CalleeFunc2; 2625 } 2626 2627 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 2628 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, 2629 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, 2630 bool &FoundMultipleCalleeChains) { 2631 // Stop recursive search if we have already explored the maximum specified 2632 // depth. 2633 if (Depth > TailCallSearchDepth) 2634 return false; 2635 2636 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { 2637 // Make a CallsiteInfo for each discovered callee, if one hasn't already 2638 // been synthesized. 2639 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) || 2640 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee)) 2641 // StackIds is empty (we don't have debug info available in the index for 2642 // these callsites) 2643 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = 2644 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>()); 2645 CallsiteInfo *NewCallsiteInfo = 2646 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); 2647 FoundCalleeChain.push_back({NewCallsiteInfo, FS}); 2648 }; 2649 2650 // Look for tail calls in this function, and check if they either call the 2651 // profiled callee directly, or indirectly (via a recursive search). 2652 // Only succeed if there is a single unique tail call chain found between the 2653 // profiled caller and callee, otherwise we could perform incorrect cloning. 2654 bool FoundSingleCalleeChain = false; 2655 for (auto &S : CurCallee.getSummaryList()) { 2656 if (!GlobalValue::isLocalLinkage(S->linkage()) && 2657 !isPrevailing(CurCallee.getGUID(), S.get())) 2658 continue; 2659 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); 2660 if (!FS) 2661 continue; 2662 auto FSVI = CurCallee; 2663 auto *AS = dyn_cast<AliasSummary>(S.get()); 2664 if (AS) 2665 FSVI = AS->getAliaseeVI(); 2666 for (auto &CallEdge : FS->calls()) { 2667 if (!CallEdge.second.hasTailCall()) 2668 continue; 2669 if (CallEdge.first == ProfiledCallee) { 2670 if (FoundSingleCalleeChain) { 2671 FoundMultipleCalleeChains = true; 2672 return false; 2673 } 2674 FoundSingleCalleeChain = true; 2675 FoundProfiledCalleeCount++; 2676 FoundProfiledCalleeDepth += Depth; 2677 if (Depth > FoundProfiledCalleeMaxDepth) 2678 FoundProfiledCalleeMaxDepth = Depth; 2679 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 2680 // Add FS to FSToVIMap in case it isn't already there. 
2681 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 2682 FSToVIMap[FS] = FSVI; 2683 } else if (findProfiledCalleeThroughTailCalls( 2684 ProfiledCallee, CallEdge.first, Depth + 1, 2685 FoundCalleeChain, FoundMultipleCalleeChains)) { 2686 // findProfiledCalleeThroughTailCalls should not have returned 2687 // true if FoundMultipleCalleeChains. 2688 assert(!FoundMultipleCalleeChains); 2689 if (FoundSingleCalleeChain) { 2690 FoundMultipleCalleeChains = true; 2691 return false; 2692 } 2693 FoundSingleCalleeChain = true; 2694 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 2695 // Add FS to FSToVIMap in case it isn't already there. 2696 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 2697 FSToVIMap[FS] = FSVI; 2698 } else if (FoundMultipleCalleeChains) 2699 return false; 2700 } 2701 } 2702 2703 return FoundSingleCalleeChain; 2704 } 2705 2706 const FunctionSummary * 2707 IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) { 2708 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee; 2709 if (Callee.getSummaryList().empty()) 2710 return nullptr; 2711 return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject()); 2712 } 2713 2714 bool IndexCallsiteContextGraph::calleeMatchesFunc( 2715 IndexCall &Call, const FunctionSummary *Func, 2716 const FunctionSummary *CallerFunc, 2717 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { 2718 ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee; 2719 // If there is no summary list then this is a call to an externally defined 2720 // symbol. 2721 AliasSummary *Alias = 2722 Callee.getSummaryList().empty() 2723 ? nullptr 2724 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); 2725 assert(FSToVIMap.count(Func)); 2726 auto FuncVI = FSToVIMap[Func]; 2727 if (Callee == FuncVI || 2728 // If callee is an alias, check the aliasee, since only function 2729 // summary base objects will contain the stack node summaries and thus 2730 // get a context node. 2731 (Alias && Alias->getAliaseeVI() == FuncVI)) 2732 return true; 2733 2734 // Recursively search for the profiled callee through tail calls starting with 2735 // the actual Callee. The discovered tail call chain is saved in 2736 // FoundCalleeChain, and we will fixup the graph to include these callsites 2737 // after returning. 2738 // FIXME: We will currently redo the same recursive walk if we find the same 2739 // mismatched callee from another callsite. We can improve this with more 2740 // bookkeeping of the created chain of new nodes for each mismatch. 2741 unsigned Depth = 1; 2742 bool FoundMultipleCalleeChains = false; 2743 if (!findProfiledCalleeThroughTailCalls( 2744 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) { 2745 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI 2746 << " from " << FSToVIMap[CallerFunc] 2747 << " that actually called " << Callee 2748 << (FoundMultipleCalleeChains 2749 ? 
" (found multiple possible chains)" 2750 : "") 2751 << "\n"); 2752 if (FoundMultipleCalleeChains) 2753 FoundProfiledCalleeNonUniquelyCount++; 2754 return false; 2755 } 2756 2757 return true; 2758 } 2759 2760 bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) { 2761 ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee; 2762 ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee; 2763 return Callee1 == Callee2; 2764 } 2765 2766 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2767 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() 2768 const { 2769 print(dbgs()); 2770 dbgs() << "\n"; 2771 } 2772 2773 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2774 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( 2775 raw_ostream &OS) const { 2776 OS << "Node " << this << "\n"; 2777 OS << "\t"; 2778 printCall(OS); 2779 if (Recursive) 2780 OS << " (recursive)"; 2781 OS << "\n"; 2782 if (!MatchingCalls.empty()) { 2783 OS << "\tMatchingCalls:\n"; 2784 for (auto &MatchingCall : MatchingCalls) { 2785 OS << "\t"; 2786 MatchingCall.print(OS); 2787 OS << "\n"; 2788 } 2789 } 2790 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; 2791 OS << "\tContextIds:"; 2792 // Make a copy of the computed context ids that we can sort for stability. 2793 auto ContextIds = getContextIds(); 2794 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2795 std::sort(SortedIds.begin(), SortedIds.end()); 2796 for (auto Id : SortedIds) 2797 OS << " " << Id; 2798 OS << "\n"; 2799 OS << "\tCalleeEdges:\n"; 2800 for (auto &Edge : CalleeEdges) 2801 OS << "\t\t" << *Edge << "\n"; 2802 OS << "\tCallerEdges:\n"; 2803 for (auto &Edge : CallerEdges) 2804 OS << "\t\t" << *Edge << "\n"; 2805 if (!Clones.empty()) { 2806 OS << "\tClones: "; 2807 ListSeparator LS; 2808 for (auto *Clone : Clones) 2809 OS << LS << Clone; 2810 OS << "\n"; 2811 } else if (CloneOf) { 2812 OS << "\tClone of " << CloneOf << "\n"; 2813 } 2814 } 2815 2816 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2817 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() 2818 const { 2819 print(dbgs()); 2820 dbgs() << "\n"; 2821 } 2822 2823 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2824 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( 2825 raw_ostream &OS) const { 2826 OS << "Edge from Callee " << Callee << " to Caller: " << Caller 2827 << " AllocTypes: " << getAllocTypeString(AllocTypes); 2828 OS << " ContextIds:"; 2829 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2830 std::sort(SortedIds.begin(), SortedIds.end()); 2831 for (auto Id : SortedIds) 2832 OS << " " << Id; 2833 } 2834 2835 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2836 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { 2837 print(dbgs()); 2838 } 2839 2840 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2841 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( 2842 raw_ostream &OS) const { 2843 OS << "Callsite Context Graph:\n"; 2844 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2845 for (const auto Node : nodes<GraphType>(this)) { 2846 if (Node->isRemoved()) 2847 continue; 2848 Node->print(OS); 2849 OS << "\n"; 2850 } 2851 } 2852 2853 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2854 void CallsiteContextGraph<DerivedCCG, 
FuncTy, CallTy>::printTotalSizes( 2855 raw_ostream &OS) const { 2856 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2857 for (const auto Node : nodes<GraphType>(this)) { 2858 if (Node->isRemoved()) 2859 continue; 2860 if (!Node->IsAllocation) 2861 continue; 2862 DenseSet<uint32_t> ContextIds = Node->getContextIds(); 2863 auto AllocTypeFromCall = getAllocationCallType(Node->Call); 2864 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2865 std::sort(SortedIds.begin(), SortedIds.end()); 2866 for (auto Id : SortedIds) { 2867 auto TypeI = ContextIdToAllocationType.find(Id); 2868 assert(TypeI != ContextIdToAllocationType.end()); 2869 auto CSI = ContextIdToContextSizeInfos.find(Id); 2870 if (CSI != ContextIdToContextSizeInfos.end()) { 2871 for (auto &Info : CSI->second) { 2872 OS << "MemProf hinting: " 2873 << getAllocTypeString((uint8_t)TypeI->second) 2874 << " full allocation context " << Info.FullStackId 2875 << " with total size " << Info.TotalSize << " is " 2876 << getAllocTypeString(Node->AllocTypes) << " after cloning"; 2877 if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall) 2878 OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall) 2879 << " due to cold byte percent"; 2880 OS << "\n"; 2881 } 2882 } 2883 } 2884 } 2885 } 2886 2887 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2888 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { 2889 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2890 for (const auto Node : nodes<GraphType>(this)) { 2891 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 2892 for (auto &Edge : Node->CallerEdges) 2893 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2894 } 2895 } 2896 2897 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2898 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { 2899 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2900 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; 2901 2902 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; 2903 static NodeRef getNode(const NodePtrTy &P) { return P.get(); } 2904 2905 using nodes_iterator = 2906 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, 2907 decltype(&getNode)>; 2908 2909 static nodes_iterator nodes_begin(GraphType G) { 2910 return nodes_iterator(G->NodeOwner.begin(), &getNode); 2911 } 2912 2913 static nodes_iterator nodes_end(GraphType G) { 2914 return nodes_iterator(G->NodeOwner.end(), &getNode); 2915 } 2916 2917 static NodeRef getEntryNode(GraphType G) { 2918 return G->NodeOwner.begin()->get(); 2919 } 2920 2921 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; 2922 static const ContextNode<DerivedCCG, FuncTy, CallTy> * 2923 GetCallee(const EdgePtrTy &P) { 2924 return P->Callee; 2925 } 2926 2927 using ChildIteratorType = 2928 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< 2929 DerivedCCG, FuncTy, CallTy>>>::const_iterator, 2930 decltype(&GetCallee)>; 2931 2932 static ChildIteratorType child_begin(NodeRef N) { 2933 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); 2934 } 2935 2936 static ChildIteratorType child_end(NodeRef N) { 2937 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); 2938 } 2939 }; 2940 2941 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2942 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> 2943 
: public DefaultDOTGraphTraits { 2944 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 2945 2946 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2947 using GTraits = GraphTraits<GraphType>; 2948 using NodeRef = typename GTraits::NodeRef; 2949 using ChildIteratorType = typename GTraits::ChildIteratorType; 2950 2951 static std::string getNodeLabel(NodeRef Node, GraphType G) { 2952 std::string LabelString = 2953 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + 2954 Twine(Node->OrigStackOrAllocId)) 2955 .str(); 2956 LabelString += "\n"; 2957 if (Node->hasCall()) { 2958 auto Func = G->NodeToCallingFunc.find(Node); 2959 assert(Func != G->NodeToCallingFunc.end()); 2960 LabelString += 2961 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); 2962 } else { 2963 LabelString += "null call"; 2964 if (Node->Recursive) 2965 LabelString += " (recursive)"; 2966 else 2967 LabelString += " (external)"; 2968 } 2969 return LabelString; 2970 } 2971 2972 static std::string getNodeAttributes(NodeRef Node, GraphType) { 2973 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + 2974 getContextIds(Node->getContextIds()) + "\"") 2975 .str(); 2976 AttributeString += 2977 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); 2978 AttributeString += ",style=\"filled\""; 2979 if (Node->CloneOf) { 2980 AttributeString += ",color=\"blue\""; 2981 AttributeString += ",style=\"filled,bold,dashed\""; 2982 } else 2983 AttributeString += ",style=\"filled\""; 2984 return AttributeString; 2985 } 2986 2987 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, 2988 GraphType) { 2989 auto &Edge = *(ChildIter.getCurrent()); 2990 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + 2991 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") 2992 .str(); 2993 } 2994 2995 // Since the NodeOwners list includes nodes that are no longer connected to 2996 // the graph, skip them here. 2997 static bool isNodeHidden(NodeRef Node, GraphType) { 2998 return Node->isRemoved(); 2999 } 3000 3001 private: 3002 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { 3003 std::string IdString = "ContextIds:"; 3004 if (ContextIds.size() < 100) { 3005 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 3006 std::sort(SortedIds.begin(), SortedIds.end()); 3007 for (auto Id : SortedIds) 3008 IdString += (" " + Twine(Id)).str(); 3009 } else { 3010 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); 3011 } 3012 return IdString; 3013 } 3014 3015 static std::string getColor(uint8_t AllocTypes) { 3016 if (AllocTypes == (uint8_t)AllocationType::NotCold) 3017 // Color "brown1" actually looks like a lighter red. 3018 return "brown1"; 3019 if (AllocTypes == (uint8_t)AllocationType::Cold) 3020 return "cyan"; 3021 if (AllocTypes == 3022 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 3023 // Lighter purple. 3024 return "mediumorchid1"; 3025 return "gray"; 3026 } 3027 3028 static std::string getNodeId(NodeRef Node) { 3029 std::stringstream SStream; 3030 SStream << std::hex << "N0x" << (unsigned long long)Node; 3031 std::string Result = SStream.str(); 3032 return Result; 3033 } 3034 }; 3035 3036 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3037 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( 3038 std::string Label) const { 3039 WriteGraph(this, "", false, Label, 3040 DotFilePathPrefix + "ccg." 
+ Label + ".dot"); 3041 } 3042 3043 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3044 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 3045 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( 3046 const std::shared_ptr<ContextEdge> &Edge, 3047 DenseSet<uint32_t> ContextIdsToMove) { 3048 ContextNode *Node = Edge->Callee; 3049 assert(NodeToCallingFunc.count(Node)); 3050 ContextNode *Clone = 3051 createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call); 3052 Node->addClone(Clone); 3053 Clone->MatchingCalls = Node->MatchingCalls; 3054 moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true, 3055 ContextIdsToMove); 3056 return Clone; 3057 } 3058 3059 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3060 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 3061 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 3062 ContextNode *NewCallee, bool NewClone, 3063 DenseSet<uint32_t> ContextIdsToMove) { 3064 // NewCallee and Edge's current callee must be clones of the same original 3065 // node (Edge's current callee may be the original node too). 3066 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); 3067 3068 ContextNode *OldCallee = Edge->Callee; 3069 3070 // We might already have an edge to the new callee from earlier cloning for a 3071 // different allocation. If one exists we will reuse it. 3072 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); 3073 3074 // Callers will pass an empty ContextIdsToMove set when they want to move the 3075 // edge. Copy in Edge's ids for simplicity. 3076 if (ContextIdsToMove.empty()) 3077 ContextIdsToMove = Edge->getContextIds(); 3078 3079 // If we are moving all of Edge's ids, then just move the whole Edge. 3080 // Otherwise only move the specified subset, to a new edge if needed. 3081 if (Edge->getContextIds().size() == ContextIdsToMove.size()) { 3082 // First, update the alloc types on New Callee from Edge. 3083 // Do this before we potentially clear Edge's fields below! 3084 NewCallee->AllocTypes |= Edge->AllocTypes; 3085 // Moving the whole Edge. 3086 if (ExistingEdgeToNewCallee) { 3087 // Since we already have an edge to NewCallee, simply move the ids 3088 // onto it, and remove the existing Edge. 3089 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), 3090 ContextIdsToMove.end()); 3091 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; 3092 assert(Edge->ContextIds == ContextIdsToMove); 3093 removeEdgeFromGraph(Edge.get()); 3094 } else { 3095 // Otherwise just reconnect Edge to NewCallee. 3096 Edge->Callee = NewCallee; 3097 NewCallee->CallerEdges.push_back(Edge); 3098 // Remove it from callee where it was previously connected. 3099 OldCallee->eraseCallerEdge(Edge.get()); 3100 // Don't need to update Edge's context ids since we are simply 3101 // reconnecting it. 3102 } 3103 } else { 3104 // Only moving a subset of Edge's ids. 3105 // Compute the alloc type of the subset of ids being moved. 3106 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove); 3107 if (ExistingEdgeToNewCallee) { 3108 // Since we already have an edge to NewCallee, simply move the ids 3109 // onto it. 3110 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), 3111 ContextIdsToMove.end()); 3112 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; 3113 } else { 3114 // Otherwise, create a new edge to NewCallee for the ids being moved. 
3115 auto NewEdge = std::make_shared<ContextEdge>( 3116 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); 3117 Edge->Caller->CalleeEdges.push_back(NewEdge); 3118 NewCallee->CallerEdges.push_back(NewEdge); 3119 } 3120 // In either case, need to update the alloc types on NewCallee, and remove 3121 // those ids and update the alloc type on the original Edge. 3122 NewCallee->AllocTypes |= CallerEdgeAllocType; 3123 set_subtract(Edge->ContextIds, ContextIdsToMove); 3124 Edge->AllocTypes = computeAllocType(Edge->ContextIds); 3125 } 3126 // Now walk the old callee node's callee edges and move Edge's context ids 3127 // over to the corresponding edge into the clone (which is created here if 3128 // this is a newly created clone). 3129 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { 3130 // The context ids moving to the new callee are the subset of this edge's 3131 // context ids and the context ids on the caller edge being moved. 3132 DenseSet<uint32_t> EdgeContextIdsToMove = 3133 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); 3134 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); 3135 OldCalleeEdge->AllocTypes = 3136 computeAllocType(OldCalleeEdge->getContextIds()); 3137 if (!NewClone) { 3138 // Update context ids / alloc type on corresponding edge to NewCallee. 3139 // There is a chance this may not exist if we are reusing an existing 3140 // clone, specifically during function assignment, where we would have 3141 // removed none type edges after creating the clone. If we can't find 3142 // a corresponding edge there, fall through to the cloning below. 3143 if (auto *NewCalleeEdge = 3144 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) { 3145 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), 3146 EdgeContextIdsToMove.end()); 3147 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove); 3148 continue; 3149 } 3150 } 3151 auto NewEdge = std::make_shared<ContextEdge>( 3152 OldCalleeEdge->Callee, NewCallee, 3153 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove); 3154 NewCallee->CalleeEdges.push_back(NewEdge); 3155 NewEdge->Callee->CallerEdges.push_back(NewEdge); 3156 } 3157 // Recompute the node alloc type now that its callee edges have been 3158 // updated (since we will compute from those edges). 3159 OldCallee->AllocTypes = OldCallee->computeAllocType(); 3160 // OldCallee alloc type should be None iff its context id set is now empty. 3161 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == 3162 OldCallee->emptyContextIds()); 3163 if (VerifyCCG) { 3164 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); 3165 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); 3166 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges) 3167 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee, 3168 /*CheckEdges=*/false); 3169 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges) 3170 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee, 3171 /*CheckEdges=*/false); 3172 } 3173 } 3174 3175 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3176 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 3177 moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge, 3178 ContextNode *NewCaller) { 3179 3180 ContextNode *OldCaller = Edge->Caller; 3181 OldCaller->eraseCalleeEdge(Edge.get()); 3182 3183 // We might already have an edge to the new caller. If one exists we will 3184 // reuse it. 
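// Reusing it (by merging this edge's context ids and alloc types into it below) avoids creating parallel edges between NewCaller and the callee.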
3185 auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee); 3186 3187 if (ExistingEdgeToNewCaller) { 3188 // Since we already have an edge to NewCaller, simply move the ids 3189 // onto it, and remove the existing Edge. 3190 ExistingEdgeToNewCaller->getContextIds().insert( 3191 Edge->getContextIds().begin(), Edge->getContextIds().end()); 3192 ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes; 3193 Edge->ContextIds.clear(); 3194 Edge->AllocTypes = (uint8_t)AllocationType::None; 3195 Edge->Callee->eraseCallerEdge(Edge.get()); 3196 } else { 3197 // Otherwise just reconnect Edge to NewCaller. 3198 Edge->Caller = NewCaller; 3199 NewCaller->CalleeEdges.push_back(Edge); 3200 // Don't need to update Edge's context ids since we are simply 3201 // reconnecting it. 3202 } 3203 // In either case, need to update the alloc types on New Caller. 3204 NewCaller->AllocTypes |= Edge->AllocTypes; 3205 3206 // Now walk the old caller node's caller edges and move Edge's context ids 3207 // over to the corresponding edge into the node (which is created here if 3208 // this is a newly created node). We can tell whether this is a newly created 3209 // node by seeing if it has any caller edges yet. 3210 #ifndef NDEBUG 3211 bool IsNewNode = NewCaller->CallerEdges.empty(); 3212 #endif 3213 for (auto &OldCallerEdge : OldCaller->CallerEdges) { 3214 // The context ids moving to the new caller are the subset of this edge's 3215 // context ids and the context ids on the callee edge being moved. 3216 DenseSet<uint32_t> EdgeContextIdsToMove = 3217 set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds()); 3218 set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove); 3219 OldCallerEdge->AllocTypes = 3220 computeAllocType(OldCallerEdge->getContextIds()); 3221 // In this function we expect that any pre-existing node already has edges 3222 // from the same callers as the old node. That should be true in the current 3223 // use case, where we will remove None-type edges after copying over all 3224 // caller edges from the callee. 3225 auto *ExistingCallerEdge = 3226 NewCaller->findEdgeFromCaller(OldCallerEdge->Caller); 3227 assert(IsNewNode || ExistingCallerEdge); 3228 if (ExistingCallerEdge) { 3229 ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), 3230 EdgeContextIdsToMove.end()); 3231 ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove); 3232 continue; 3233 } 3234 auto NewEdge = std::make_shared<ContextEdge>( 3235 NewCaller, OldCallerEdge->Caller, 3236 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove); 3237 NewCaller->CallerEdges.push_back(NewEdge); 3238 NewEdge->Caller->CalleeEdges.push_back(NewEdge); 3239 } 3240 // Recompute the node alloc type now that its caller edges have been 3241 // updated (since we will compute from those edges). 3242 OldCaller->AllocTypes = OldCaller->computeAllocType(); 3243 // OldCaller alloc type should be None iff its context id set is now empty. 
3244 assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) == 3245 OldCaller->emptyContextIds()); 3246 if (VerifyCCG) { 3247 checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false); 3248 checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false); 3249 for (const auto &OldCallerEdge : OldCaller->CallerEdges) 3250 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller, 3251 /*CheckEdges=*/false); 3252 for (const auto &NewCallerEdge : NewCaller->CallerEdges) 3253 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller, 3254 /*CheckEdges=*/false); 3255 } 3256 } 3257 3258 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3259 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 3260 recursivelyRemoveNoneTypeCalleeEdges( 3261 ContextNode *Node, DenseSet<const ContextNode *> &Visited) { 3262 auto Inserted = Visited.insert(Node); 3263 if (!Inserted.second) 3264 return; 3265 3266 removeNoneTypeCalleeEdges(Node); 3267 3268 for (auto *Clone : Node->Clones) 3269 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited); 3270 3271 // The recursive call may remove some of this Node's caller edges. 3272 // Iterate over a copy and skip any that were removed. 3273 auto CallerEdges = Node->CallerEdges; 3274 for (auto &Edge : CallerEdges) { 3275 // Skip any that have been removed by an earlier recursive call. 3276 if (Edge->isRemoved()) { 3277 assert(!is_contained(Node->CallerEdges, Edge)); 3278 continue; 3279 } 3280 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited); 3281 } 3282 } 3283 3284 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3285 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { 3286 DenseSet<const ContextNode *> Visited; 3287 for (auto &Entry : AllocationCallToContextNodeMap) { 3288 Visited.clear(); 3289 identifyClones(Entry.second, Visited, Entry.second->getContextIds()); 3290 } 3291 Visited.clear(); 3292 for (auto &Entry : AllocationCallToContextNodeMap) 3293 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited); 3294 if (VerifyCCG) 3295 check(); 3296 } 3297 3298 // Helper function to check whether an AllocType is cold or notcold or both. 3299 bool checkColdOrNotCold(uint8_t AllocType) { 3300 return (AllocType == (uint8_t)AllocationType::Cold) || 3301 (AllocType == (uint8_t)AllocationType::NotCold) || 3302 (AllocType == 3303 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold)); 3304 } 3305 3306 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3307 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( 3308 ContextNode *Node, DenseSet<const ContextNode *> &Visited, 3309 const DenseSet<uint32_t> &AllocContextIds) { 3310 if (VerifyNodes) 3311 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 3312 assert(!Node->CloneOf); 3313 3314 // If Node has a null call, then either it wasn't found in the module (regular 3315 // LTO) or summary index (ThinLTO), or there were other conditions blocking 3316 // cloning (e.g. recursion, calls multiple targets, etc.). 3317 // Do this here so that we don't try to recursively clone callers below, which 3318 // isn't useful at least for this node. 3319 if (!Node->hasCall()) 3320 return; 3321 3322 #ifndef NDEBUG 3323 auto Insert = 3324 #endif 3325 Visited.insert(Node); 3326 // We should not have visited this node yet. 3327 assert(Insert.second); 3328 // The recursive call to identifyClones may delete the current edge from the 3329 // CallerEdges vector.
Make a copy and iterate on that, simpler than passing 3330 // in an iterator and having the recursive call erase from it. Other edges may 3331 // also get removed during the recursion, which will have null Callee and 3332 // Caller pointers (and are deleted later), so we skip those below. 3333 { 3334 auto CallerEdges = Node->CallerEdges; 3335 for (auto &Edge : CallerEdges) { 3336 // Skip any that have been removed by an earlier recursive call. 3337 if (Edge->isRemoved()) { 3338 assert(!is_contained(Node->CallerEdges, Edge)); 3339 continue; 3340 } 3341 // Ignore any caller we previously visited via another edge. 3342 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) { 3343 identifyClones(Edge->Caller, Visited, AllocContextIds); 3344 } 3345 } 3346 } 3347 3348 // Check if we reached an unambiguous call or have only a single caller. 3349 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) 3350 return; 3351 3352 // We need to clone. 3353 3354 // Try to keep the original version as alloc type NotCold. This will make 3355 // cases with indirect calls or any other situation with an unknown call to 3356 // the original function get the default behavior. We do this by sorting the 3357 // CallerEdges of the Node we will clone by alloc type. 3358 // 3359 // Give NotCold edge the lowest sort priority so those edges are at the end of 3360 // the caller edges vector, and stay on the original version (since the below 3361 // code clones greedily until it finds all remaining edges have the same type 3362 // and leaves the remaining ones on the original Node). 3363 // 3364 // We shouldn't actually have any None type edges, so the sorting priority for 3365 // that is arbitrary, and we assert in that case below. 3366 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4, 3367 /*Cold*/ 1, 3368 /*NotColdCold*/ 2}; 3369 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(), 3370 [&](const std::shared_ptr<ContextEdge> &A, 3371 const std::shared_ptr<ContextEdge> &B) { 3372 // Nodes with non-empty context ids should be sorted before 3373 // those with empty context ids. 3374 if (A->ContextIds.empty()) 3375 // Either B ContextIds are non-empty (in which case we 3376 // should return false because B < A), or B ContextIds 3377 // are empty, in which case they are equal, and we should 3378 // maintain the original relative ordering. 3379 return false; 3380 if (B->ContextIds.empty()) 3381 return true; 3382 3383 if (A->AllocTypes == B->AllocTypes) 3384 // Use the first context id for each edge as a 3385 // tie-breaker. 3386 return *A->ContextIds.begin() < *B->ContextIds.begin(); 3387 return AllocTypeCloningPriority[A->AllocTypes] < 3388 AllocTypeCloningPriority[B->AllocTypes]; 3389 }); 3390 3391 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 3392 3393 DenseSet<uint32_t> RecursiveContextIds; 3394 // If we are allowing recursive callsites, but have also disabled recursive 3395 // contexts, look for context ids that show up in multiple caller edges. 3396 if (AllowRecursiveCallsites && !AllowRecursiveContexts) { 3397 DenseSet<uint32_t> AllCallerContextIds; 3398 for (auto &CE : Node->CallerEdges) { 3399 // Reserve space for the largest set of caller context ids, since we know 3400 // the final set will be at least that large.
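// Any id that insert() reports as already present was seen on another caller edge of this node, i.e. it belongs to a recursive context, and is excluded from cloning below via set_difference.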
3401 AllCallerContextIds.reserve(CE->getContextIds().size()); 3402 for (auto Id : CE->getContextIds()) 3403 if (!AllCallerContextIds.insert(Id).second) 3404 RecursiveContextIds.insert(Id); 3405 } 3406 } 3407 3408 // Iterate until we find no more opportunities for disambiguating the alloc 3409 // types via cloning. In most cases this loop will terminate once the Node 3410 // has a single allocation type, in which case no more cloning is needed. 3411 // Iterate over a copy of Node's caller edges, since we may need to remove 3412 // edges in the moveEdgeTo* methods, and this simplifies the handling and 3413 // makes it less error-prone. 3414 auto CallerEdges = Node->CallerEdges; 3415 for (auto &CallerEdge : CallerEdges) { 3416 // See if cloning the prior caller edge left this node with a single alloc 3417 // type or a single caller. In that case no more cloning of Node is needed. 3418 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) 3419 break; 3420 3421 // If the caller was not successfully matched to a call in the IR/summary, 3422 // there is no point in trying to clone for it as we can't update that call. 3423 if (!CallerEdge->Caller->hasCall()) 3424 continue; 3425 3426 // Only need to process the ids along this edge pertaining to the given 3427 // allocation. 3428 auto CallerEdgeContextsForAlloc = 3429 set_intersection(CallerEdge->getContextIds(), AllocContextIds); 3430 if (!RecursiveContextIds.empty()) 3431 CallerEdgeContextsForAlloc = 3432 set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds); 3433 if (CallerEdgeContextsForAlloc.empty()) 3434 continue; 3435 3436 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc); 3437 3438 // Compute the node callee edge alloc types corresponding to the context ids 3439 // for this caller edge. 3440 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; 3441 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size()); 3442 for (auto &CalleeEdge : Node->CalleeEdges) 3443 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( 3444 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc)); 3445 3446 // Don't clone if doing so will not disambiguate any alloc types amongst 3447 // caller edges (including the callee edges that would be cloned). 3448 // Otherwise we will simply move all edges to the clone. 3449 // 3450 // First check if by cloning we will disambiguate the caller allocation 3451 // type from node's allocation type. Query allocTypeToUse so that we don't 3452 // bother cloning to distinguish NotCold+Cold from NotCold. Note that 3453 // neither of these should be None type. 3454 // 3455 // Then check if by cloning node at least one of the callee edges will be 3456 // disambiguated by splitting out different context ids. 3457 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); 3458 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 3459 if (allocTypeToUse(CallerAllocTypeForAlloc) == 3460 allocTypeToUse(Node->AllocTypes) && 3461 allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 3462 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) 3463 continue; 3464 3465 // First see if we can use an existing clone. Check each clone and its 3466 // callee edges for matching alloc types. 
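// Reusing a compatible clone keeps Node->Clones small, which in turn limits how many function clones assignFunctions() has to create later.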
3467 ContextNode *Clone = nullptr; 3468 for (auto *CurClone : Node->Clones) { 3469 if (allocTypeToUse(CurClone->AllocTypes) != 3470 allocTypeToUse(CallerAllocTypeForAlloc)) 3471 continue; 3472 3473 bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) && 3474 hasSingleAllocType(CallerAllocTypeForAlloc); 3475 // The above check should mean that if both have single alloc types that 3476 // they should be equal. 3477 assert(!BothSingleAlloc || 3478 CurClone->AllocTypes == CallerAllocTypeForAlloc); 3479 3480 // If either both have a single alloc type (which are the same), or if the 3481 // clone's callee edges have the same alloc types as those for the current 3482 // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge), 3483 // then we can reuse this clone. 3484 if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>( 3485 CalleeEdgeAllocTypesForCallerEdge, CurClone)) { 3486 Clone = CurClone; 3487 break; 3488 } 3489 } 3490 3491 // The edge iterator is adjusted when we move the CallerEdge to the clone. 3492 if (Clone) 3493 moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false, 3494 CallerEdgeContextsForAlloc); 3495 else 3496 Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc); 3497 3498 // Sanity check that no alloc types on clone or its edges are None. 3499 assert(Clone->AllocTypes != (uint8_t)AllocationType::None); 3500 } 3501 3502 // We should still have some context ids on the original Node. 3503 assert(!Node->emptyContextIds()); 3504 3505 // Sanity check that no alloc types on node or edges are None. 3506 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 3507 3508 if (VerifyNodes) 3509 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 3510 } 3511 3512 void ModuleCallsiteContextGraph::updateAllocationCall( 3513 CallInfo &Call, AllocationType AllocType) { 3514 std::string AllocTypeString = getAllocTypeAttributeString(AllocType); 3515 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), 3516 "memprof", AllocTypeString); 3517 cast<CallBase>(Call.call())->addFnAttr(A); 3518 OREGetter(Call.call()->getFunction()) 3519 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) 3520 << ore::NV("AllocationCall", Call.call()) << " in clone " 3521 << ore::NV("Caller", Call.call()->getFunction()) 3522 << " marked with memprof allocation attribute " 3523 << ore::NV("Attribute", AllocTypeString)); 3524 } 3525 3526 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, 3527 AllocationType AllocType) { 3528 auto *AI = cast<AllocInfo *>(Call.call()); 3529 assert(AI); 3530 assert(AI->Versions.size() > Call.cloneNo()); 3531 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; 3532 } 3533 3534 AllocationType 3535 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { 3536 const auto *CB = cast<CallBase>(Call.call()); 3537 if (!CB->getAttributes().hasFnAttr("memprof")) 3538 return AllocationType::None; 3539 return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 3540 ? 
AllocationType::Cold 3541 : AllocationType::NotCold; 3542 } 3543 3544 AllocationType 3545 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const { 3546 const auto *AI = cast<AllocInfo *>(Call.call()); 3547 assert(AI->Versions.size() > Call.cloneNo()); 3548 return (AllocationType)AI->Versions[Call.cloneNo()]; 3549 } 3550 3551 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, 3552 FuncInfo CalleeFunc) { 3553 if (CalleeFunc.cloneNo() > 0) 3554 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); 3555 OREGetter(CallerCall.call()->getFunction()) 3556 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) 3557 << ore::NV("Call", CallerCall.call()) << " in clone " 3558 << ore::NV("Caller", CallerCall.call()->getFunction()) 3559 << " assigned to call function clone " 3560 << ore::NV("Callee", CalleeFunc.func())); 3561 } 3562 3563 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, 3564 FuncInfo CalleeFunc) { 3565 auto *CI = cast<CallsiteInfo *>(CallerCall.call()); 3566 assert(CI && 3567 "Caller cannot be an allocation which should not have profiled calls"); 3568 assert(CI->Clones.size() > CallerCall.cloneNo()); 3569 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); 3570 } 3571 3572 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 3573 Instruction *>::FuncInfo 3574 ModuleCallsiteContextGraph::cloneFunctionForCallsite( 3575 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 3576 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 3577 // Use existing LLVM facilities for cloning and obtaining Call in clone 3578 ValueToValueMapTy VMap; 3579 auto *NewFunc = CloneFunction(Func.func(), VMap); 3580 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); 3581 assert(!Func.func()->getParent()->getFunction(Name)); 3582 NewFunc->setName(Name); 3583 for (auto &Inst : CallsWithMetadataInFunc) { 3584 // This map always has the initial version in it. 3585 assert(Inst.cloneNo() == 0); 3586 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; 3587 } 3588 OREGetter(Func.func()) 3589 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) 3590 << "created clone " << ore::NV("NewFunction", NewFunc)); 3591 return {NewFunc, CloneNo}; 3592 } 3593 3594 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 3595 IndexCall>::FuncInfo 3596 IndexCallsiteContextGraph::cloneFunctionForCallsite( 3597 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 3598 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 3599 // Check how many clones we have of Call (and therefore function). 3600 // The next clone number is the current size of versions array. 3601 // Confirm this matches the CloneNo provided by the caller, which is based on 3602 // the number of function clones we have. 3603 assert(CloneNo == (isa<AllocInfo *>(Call.call()) 3604 ? cast<AllocInfo *>(Call.call())->Versions.size() 3605 : cast<CallsiteInfo *>(Call.call())->Clones.size())); 3606 // Walk all the instructions in this function. Create a new version for 3607 // each (by adding an entry to the Versions/Clones summary array), and copy 3608 // over the version being called for the function clone being cloned here. 3609 // Additionally, add an entry to the CallMap for the new function clone, 3610 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) 3611 // to the new call clone. 
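// E.g., when creating function clone 2, each AllocInfo's Versions array and each CallsiteInfo's Clones array grows from size 2 to 3; the new entry is a placeholder (0) that is filled in later by updateAllocationCall / updateCall.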
3612 for (auto &Inst : CallsWithMetadataInFunc) { 3613 // This map always has the initial version in it. 3614 assert(Inst.cloneNo() == 0); 3615 if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) { 3616 assert(AI->Versions.size() == CloneNo); 3617 // We assign the allocation type later (in updateAllocationCall), just add 3618 // an entry for it here. 3619 AI->Versions.push_back(0); 3620 } else { 3621 auto *CI = cast<CallsiteInfo *>(Inst.call()); 3622 assert(CI && CI->Clones.size() == CloneNo); 3623 // We assign the clone number later (in updateCall), just add an entry for 3624 // it here. 3625 CI->Clones.push_back(0); 3626 } 3627 CallMap[Inst] = {Inst.call(), CloneNo}; 3628 } 3629 return {Func.func(), CloneNo}; 3630 } 3631 3632 // This method assigns cloned callsites to functions, cloning the functions as 3633 // needed. The assignment is greedy and proceeds roughly as follows: 3634 // 3635 // For each function Func: 3636 // For each call with graph Node having clones: 3637 // Initialize ClonesWorklist to Node and its clones 3638 // Initialize NodeCloneCount to 0 3639 // While ClonesWorklist is not empty: 3640 // Clone = pop front ClonesWorklist 3641 // NodeCloneCount++ 3642 // If Func has been cloned less than NodeCloneCount times: 3643 // If NodeCloneCount is 1: 3644 // Assign Clone to original Func 3645 // Continue 3646 // Create a new function clone 3647 // If other callers not assigned to call a function clone yet: 3648 // Assign them to call new function clone 3649 // Continue 3650 // Assign any other caller calling the cloned version to new clone 3651 // 3652 // For each caller of Clone: 3653 // If caller is assigned to call a specific function clone: 3654 // If we cannot assign Clone to that function clone: 3655 // Create new callsite Clone NewClone 3656 // Add NewClone to ClonesWorklist 3657 // Continue 3658 // Assign Clone to existing caller's called function clone 3659 // Else: 3660 // If Clone not already assigned to a function clone: 3661 // Assign to first function clone without assignment 3662 // Assign caller to selected function clone 3663 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3664 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { 3665 bool Changed = false; 3666 3667 // Keep track of the assignment of nodes (callsites) to function clones they 3668 // call. 3669 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; 3670 3671 // Update caller node to call function version CalleeFunc, by recording the 3672 // assignment in CallsiteToCalleeFuncCloneMap. 3673 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, 3674 const FuncInfo &CalleeFunc) { 3675 assert(Caller->hasCall()); 3676 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; 3677 }; 3678 3679 // Walk all functions for which we saw calls with memprof metadata, and handle 3680 // cloning for each of its calls. 3681 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 3682 FuncInfo OrigFunc(Func); 3683 // Map from each clone of OrigFunc to a map of remappings of each call of 3684 // interest (from original uncloned call to the corresponding cloned call in 3685 // that function clone). 3686 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; 3687 for (auto &Call : CallsWithMetadata) { 3688 ContextNode *Node = getNodeForInst(Call); 3689 // Skip call if we do not have a node for it (all uses of its stack ids 3690 // were either on inlined chains or pruned from the MIBs), or if we did 3691 // not create any clones for it. 
3692 if (!Node || Node->Clones.empty()) 3693 continue; 3694 assert(Node->hasCall() && 3695 "Not having a call should have prevented cloning"); 3696 3697 // Track the assignment of function clones to clones of the current 3698 // callsite Node being handled. 3699 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; 3700 3701 // Assign callsite version CallsiteClone to function version FuncClone, 3702 // and also assign (possibly cloned) Call to CallsiteClone. 3703 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, 3704 CallInfo &Call, 3705 ContextNode *CallsiteClone, 3706 bool IsAlloc) { 3707 // Record the clone of callsite node assigned to this function clone. 3708 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; 3709 3710 assert(FuncClonesToCallMap.count(FuncClone)); 3711 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; 3712 CallInfo CallClone(Call); 3713 if (CallMap.count(Call)) 3714 CallClone = CallMap[Call]; 3715 CallsiteClone->setCall(CallClone); 3716 // Need to do the same for all matching calls. 3717 for (auto &MatchingCall : Node->MatchingCalls) { 3718 CallInfo CallClone(MatchingCall); 3719 if (CallMap.count(MatchingCall)) 3720 CallClone = CallMap[MatchingCall]; 3721 // Updates the call in the list. 3722 MatchingCall = CallClone; 3723 } 3724 }; 3725 3726 // Keep track of the clones of callsite Node that need to be assigned to 3727 // function clones. This list may be expanded in the loop body below if we 3728 // find additional cloning is required. 3729 std::deque<ContextNode *> ClonesWorklist; 3730 // Ignore original Node if we moved all of its contexts to clones. 3731 if (!Node->emptyContextIds()) 3732 ClonesWorklist.push_back(Node); 3733 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), 3734 Node->Clones.end()); 3735 3736 // Now walk through all of the clones of this callsite Node that we need, 3737 // and determine the assignment to a corresponding clone of the current 3738 // function (creating new function clones as needed). 3739 unsigned NodeCloneCount = 0; 3740 while (!ClonesWorklist.empty()) { 3741 ContextNode *Clone = ClonesWorklist.front(); 3742 ClonesWorklist.pop_front(); 3743 NodeCloneCount++; 3744 if (VerifyNodes) 3745 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 3746 3747 // Need to create a new function clone if we have more callsite clones 3748 // than existing function clones, which would have been assigned to an 3749 // earlier clone in the list (we assign callsite clones to function 3750 // clones greedily). 3751 if (FuncClonesToCallMap.size() < NodeCloneCount) { 3752 // If this is the first callsite copy, assign to original function. 3753 if (NodeCloneCount == 1) { 3754 // Since FuncClonesToCallMap is empty in this case, no clones have 3755 // been created for this function yet, and no callers should have 3756 // been assigned a function clone for this callee node yet. 3757 assert(llvm::none_of( 3758 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 3759 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 3760 })); 3761 // Initialize with empty call map, assign Clone to original function 3762 // and its callers, and skip to the next clone. 3763 FuncClonesToCallMap[OrigFunc] = {}; 3764 AssignCallsiteCloneToFuncClone( 3765 OrigFunc, Call, Clone, 3766 AllocationCallToContextNodeMap.count(Call)); 3767 for (auto &CE : Clone->CallerEdges) { 3768 // Ignore any caller that does not have a recorded callsite Call. 
3769 if (!CE->Caller->hasCall()) 3770 continue; 3771 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); 3772 } 3773 continue; 3774 } 3775 3776 // First locate which copy of OrigFunc to clone again. If a caller 3777 // of this callsite clone was already assigned to call a particular 3778 // function clone, we need to redirect all of those callers to the 3779 // new function clone, and update their other callees within this 3780 // function. 3781 FuncInfo PreviousAssignedFuncClone; 3782 auto EI = llvm::find_if( 3783 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 3784 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 3785 }); 3786 bool CallerAssignedToCloneOfFunc = false; 3787 if (EI != Clone->CallerEdges.end()) { 3788 const std::shared_ptr<ContextEdge> &Edge = *EI; 3789 PreviousAssignedFuncClone = 3790 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 3791 CallerAssignedToCloneOfFunc = true; 3792 } 3793 3794 // Clone function and save it along with the CallInfo map created 3795 // during cloning in the FuncClonesToCallMap. 3796 std::map<CallInfo, CallInfo> NewCallMap; 3797 unsigned CloneNo = FuncClonesToCallMap.size(); 3798 assert(CloneNo > 0 && "Clone 0 is the original function, which " 3799 "should already exist in the map"); 3800 FuncInfo NewFuncClone = cloneFunctionForCallsite( 3801 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); 3802 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); 3803 FunctionClonesAnalysis++; 3804 Changed = true; 3805 3806 // If no caller callsites were already assigned to a clone of this 3807 // function, we can simply assign this clone to the new func clone 3808 // and update all callers to it, then skip to the next clone. 3809 if (!CallerAssignedToCloneOfFunc) { 3810 AssignCallsiteCloneToFuncClone( 3811 NewFuncClone, Call, Clone, 3812 AllocationCallToContextNodeMap.count(Call)); 3813 for (auto &CE : Clone->CallerEdges) { 3814 // Ignore any caller that does not have a recorded callsite Call. 3815 if (!CE->Caller->hasCall()) 3816 continue; 3817 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 3818 } 3819 continue; 3820 } 3821 3822 // We may need to do additional node cloning in this case. 3823 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers 3824 // that were previously assigned to call PreviousAssignedFuncClone, 3825 // to record that they now call NewFuncClone. 3826 // The none type edge removal may remove some of this Clone's caller 3827 // edges, if it is reached via another of its caller's callees. 3828 // Iterate over a copy and skip any that were removed. 3829 auto CallerEdges = Clone->CallerEdges; 3830 for (auto CE : CallerEdges) { 3831 // Skip any that have been removed on an earlier iteration. 3832 if (CE->isRemoved()) { 3833 assert(!is_contained(Clone->CallerEdges, CE)); 3834 continue; 3835 } 3836 assert(CE); 3837 // Ignore any caller that does not have a recorded callsite Call. 3838 if (!CE->Caller->hasCall()) 3839 continue; 3840 3841 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || 3842 // We subsequently fall through to later handling that 3843 // will perform any additional cloning required for 3844 // callers that were calling other function clones. 
3845 CallsiteToCalleeFuncCloneMap[CE->Caller] != 3846 PreviousAssignedFuncClone) 3847 continue; 3848 3849 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 3850 3851 // If we are cloning a function that was already assigned to some 3852 // callers, then essentially we are creating new callsite clones 3853 // of the other callsites in that function that are reached by those 3854 // callers. Clone the other callees of the current callsite's caller 3855 // that were already assigned to PreviousAssignedFuncClone 3856 // accordingly. This is important since we subsequently update the 3857 // calls from the nodes in the graph and their assignments to callee 3858 // functions recorded in CallsiteToCalleeFuncCloneMap. 3859 // The none type edge removal may remove some of this caller's 3860 // callee edges, if it is reached via another of its callees. 3861 // Iterate over a copy and skip any that were removed. 3862 auto CalleeEdges = CE->Caller->CalleeEdges; 3863 for (auto CalleeEdge : CalleeEdges) { 3864 // Skip any that have been removed on an earlier iteration when 3865 // cleaning up newly None type callee edges. 3866 if (CalleeEdge->isRemoved()) { 3867 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge)); 3868 continue; 3869 } 3870 assert(CalleeEdge); 3871 ContextNode *Callee = CalleeEdge->Callee; 3872 // Skip the current callsite, we are looking for other 3873 // callsites Caller calls, as well as any that does not have a 3874 // recorded callsite Call. 3875 if (Callee == Clone || !Callee->hasCall()) 3876 continue; 3877 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); 3878 removeNoneTypeCalleeEdges(NewClone); 3879 // Moving the edge may have resulted in some none type 3880 // callee edges on the original Callee. 3881 removeNoneTypeCalleeEdges(Callee); 3882 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 3883 // If the Callee node was already assigned to call a specific 3884 // function version, make sure its new clone is assigned to call 3885 // that same function clone. 3886 if (CallsiteToCalleeFuncCloneMap.count(Callee)) 3887 RecordCalleeFuncOfCallsite( 3888 NewClone, CallsiteToCalleeFuncCloneMap[Callee]); 3889 // Update NewClone with the new Call clone of this callsite's Call 3890 // created for the new function clone created earlier. 3891 // Recall that we have already ensured when building the graph 3892 // that each caller can only call callsites within the same 3893 // function, so we are guaranteed that Callee Call is in the 3894 // current OrigFunc. 3895 // CallMap is set up as indexed by original Call at clone 0. 3896 CallInfo OrigCall(Callee->getOrigNode()->Call); 3897 OrigCall.setCloneNo(0); 3898 std::map<CallInfo, CallInfo> &CallMap = 3899 FuncClonesToCallMap[NewFuncClone]; 3900 assert(CallMap.count(OrigCall)); 3901 CallInfo NewCall(CallMap[OrigCall]); 3902 assert(NewCall); 3903 NewClone->setCall(NewCall); 3904 // Need to do the same for all matching calls. 3905 for (auto &MatchingCall : NewClone->MatchingCalls) { 3906 CallInfo OrigMatchingCall(MatchingCall); 3907 OrigMatchingCall.setCloneNo(0); 3908 assert(CallMap.count(OrigMatchingCall)); 3909 CallInfo NewCall(CallMap[OrigMatchingCall]); 3910 assert(NewCall); 3911 // Updates the call in the list. 3912 MatchingCall = NewCall; 3913 } 3914 } 3915 } 3916 // Fall through to handling below to perform the recording of the 3917 // function for this callsite clone. This enables handling of cases 3918 // where the callers were assigned to different clones of a function. 
3919 } 3920 3921 // See if we can use existing function clone. Walk through 3922 // all caller edges to see if any have already been assigned to 3923 // a clone of this callsite's function. If we can use it, do so. If not, 3924 // because that function clone is already assigned to a different clone 3925 // of this callsite, then we need to clone again. 3926 // Basically, this checking is needed to handle the case where different 3927 // caller functions/callsites may need versions of this function 3928 // containing different mixes of callsite clones across the different 3929 // callsites within the function. If that happens, we need to create 3930 // additional function clones to handle the various combinations. 3931 // 3932 // Keep track of any new clones of this callsite created by the 3933 // following loop, as well as any existing clone that we decided to 3934 // assign this clone to. 3935 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; 3936 FuncInfo FuncCloneAssignedToCurCallsiteClone; 3937 // Iterate over a copy of Clone's caller edges, since we may need to 3938 // remove edges in the moveEdgeTo* methods, and this simplifies the 3939 // handling and makes it less error-prone. 3940 auto CloneCallerEdges = Clone->CallerEdges; 3941 for (auto &Edge : CloneCallerEdges) { 3942 // Ignore any caller that does not have a recorded callsite Call. 3943 if (!Edge->Caller->hasCall()) 3944 continue; 3945 // If this caller already assigned to call a version of OrigFunc, need 3946 // to ensure we can assign this callsite clone to that function clone. 3947 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { 3948 FuncInfo FuncCloneCalledByCaller = 3949 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 3950 // First we need to confirm that this function clone is available 3951 // for use by this callsite node clone. 3952 // 3953 // While FuncCloneToCurNodeCloneMap is built only for this Node and 3954 // its callsite clones, one of those callsite clones X could have 3955 // been assigned to the same function clone called by Edge's caller 3956 // - if Edge's caller calls another callsite within Node's original 3957 // function, and that callsite has another caller reaching clone X. 3958 // We need to clone Node again in this case. 3959 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && 3960 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != 3961 Clone) || 3962 // Detect when we have multiple callers of this callsite that 3963 // have already been assigned to specific, and different, clones 3964 // of OrigFunc (due to other unrelated callsites in Func they 3965 // reach via call contexts). Is this Clone of callsite Node 3966 // assigned to a different clone of OrigFunc? If so, clone Node 3967 // again. 3968 (FuncCloneAssignedToCurCallsiteClone && 3969 FuncCloneAssignedToCurCallsiteClone != 3970 FuncCloneCalledByCaller)) { 3971 // We need to use a different newly created callsite clone, in 3972 // order to assign it to another new function clone on a 3973 // subsequent iteration over the Clones array (adjusted below). 3974 // Note we specifically do not reset the 3975 // CallsiteToCalleeFuncCloneMap entry for this caller, so that 3976 // when this new clone is processed later we know which version of 3977 // the function to copy (so that other callsite clones we have 3978 // assigned to that function clone are properly cloned over). See 3979 // comments in the function cloning handling earlier. 
3980 3981 // Check if we already have cloned this callsite again while 3982 // walking through caller edges, for a caller calling the same 3983 // function clone. If so, we can move this edge to that new clone 3984 // rather than creating yet another new clone. 3985 if (FuncCloneToNewCallsiteCloneMap.count( 3986 FuncCloneCalledByCaller)) { 3987 ContextNode *NewClone = 3988 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; 3989 moveEdgeToExistingCalleeClone(Edge, NewClone); 3990 // Cleanup any none type edges cloned over. 3991 removeNoneTypeCalleeEdges(NewClone); 3992 } else { 3993 // Create a new callsite clone. 3994 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge); 3995 removeNoneTypeCalleeEdges(NewClone); 3996 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = 3997 NewClone; 3998 // Add to list of clones and process later. 3999 ClonesWorklist.push_back(NewClone); 4000 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 4001 } 4002 // Moving the caller edge may have resulted in some none type 4003 // callee edges. 4004 removeNoneTypeCalleeEdges(Clone); 4005 // We will handle the newly created callsite clone in a subsequent 4006 // iteration over this Node's Clones. 4007 continue; 4008 } 4009 4010 // Otherwise, we can use the function clone already assigned to this 4011 // caller. 4012 if (!FuncCloneAssignedToCurCallsiteClone) { 4013 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; 4014 // Assign Clone to FuncCloneCalledByCaller 4015 AssignCallsiteCloneToFuncClone( 4016 FuncCloneCalledByCaller, Call, Clone, 4017 AllocationCallToContextNodeMap.count(Call)); 4018 } else 4019 // Don't need to do anything - callsite is already calling this 4020 // function clone. 4021 assert(FuncCloneAssignedToCurCallsiteClone == 4022 FuncCloneCalledByCaller); 4023 4024 } else { 4025 // We have not already assigned this caller to a version of 4026 // OrigFunc. Do the assignment now. 4027 4028 // First check if we have already assigned this callsite clone to a 4029 // clone of OrigFunc for another caller during this iteration over 4030 // its caller edges. 4031 if (!FuncCloneAssignedToCurCallsiteClone) { 4032 // Find first function in FuncClonesToCallMap without an assigned 4033 // clone of this callsite Node. We should always have one 4034 // available at this point due to the earlier cloning when the 4035 // FuncClonesToCallMap size was smaller than the clone number. 4036 for (auto &CF : FuncClonesToCallMap) { 4037 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { 4038 FuncCloneAssignedToCurCallsiteClone = CF.first; 4039 break; 4040 } 4041 } 4042 assert(FuncCloneAssignedToCurCallsiteClone); 4043 // Assign Clone to FuncCloneAssignedToCurCallsiteClone 4044 AssignCallsiteCloneToFuncClone( 4045 FuncCloneAssignedToCurCallsiteClone, Call, Clone, 4046 AllocationCallToContextNodeMap.count(Call)); 4047 } else 4048 assert(FuncCloneToCurNodeCloneMap 4049 [FuncCloneAssignedToCurCallsiteClone] == Clone); 4050 // Update callers to record function version called. 
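// This mapping is what the final UpdateCalls traversal consults when rewriting each caller's call to target the chosen function clone.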
4051 RecordCalleeFuncOfCallsite(Edge->Caller, 4052 FuncCloneAssignedToCurCallsiteClone); 4053 } 4054 } 4055 } 4056 if (VerifyCCG) { 4057 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 4058 for (const auto &PE : Node->CalleeEdges) 4059 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 4060 for (const auto &CE : Node->CallerEdges) 4061 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 4062 for (auto *Clone : Node->Clones) { 4063 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 4064 for (const auto &PE : Clone->CalleeEdges) 4065 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 4066 for (const auto &CE : Clone->CallerEdges) 4067 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 4068 } 4069 } 4070 } 4071 } 4072 4073 uint8_t BothTypes = 4074 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 4075 4076 auto UpdateCalls = [&](ContextNode *Node, 4077 DenseSet<const ContextNode *> &Visited, 4078 auto &&UpdateCalls) { 4079 auto Inserted = Visited.insert(Node); 4080 if (!Inserted.second) 4081 return; 4082 4083 for (auto *Clone : Node->Clones) 4084 UpdateCalls(Clone, Visited, UpdateCalls); 4085 4086 for (auto &Edge : Node->CallerEdges) 4087 UpdateCalls(Edge->Caller, Visited, UpdateCalls); 4088 4089 // Skip if either no call to update, or if we ended up with no context ids 4090 // (we moved all edges onto other clones). 4091 if (!Node->hasCall() || Node->emptyContextIds()) 4092 return; 4093 4094 if (Node->IsAllocation) { 4095 auto AT = allocTypeToUse(Node->AllocTypes); 4096 // If the allocation type is ambiguous, and more aggressive hinting 4097 // has been enabled via the MinClonedColdBytePercent flag, see if this 4098 // allocation should be hinted cold anyway because its fraction cold bytes 4099 // allocated is at least the given threshold. 4100 if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 && 4101 !ContextIdToContextSizeInfos.empty()) { 4102 uint64_t TotalCold = 0; 4103 uint64_t Total = 0; 4104 for (auto Id : Node->getContextIds()) { 4105 auto TypeI = ContextIdToAllocationType.find(Id); 4106 assert(TypeI != ContextIdToAllocationType.end()); 4107 auto CSI = ContextIdToContextSizeInfos.find(Id); 4108 if (CSI != ContextIdToContextSizeInfos.end()) { 4109 for (auto &Info : CSI->second) { 4110 Total += Info.TotalSize; 4111 if (TypeI->second == AllocationType::Cold) 4112 TotalCold += Info.TotalSize; 4113 } 4114 } 4115 } 4116 if (TotalCold * 100 >= Total * MinClonedColdBytePercent) 4117 AT = AllocationType::Cold; 4118 } 4119 updateAllocationCall(Node->Call, AT); 4120 assert(Node->MatchingCalls.empty()); 4121 return; 4122 } 4123 4124 if (!CallsiteToCalleeFuncCloneMap.count(Node)) 4125 return; 4126 4127 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; 4128 updateCall(Node->Call, CalleeFunc); 4129 // Update all the matching calls as well. 4130 for (auto &Call : Node->MatchingCalls) 4131 updateCall(Call, CalleeFunc); 4132 }; 4133 4134 // Performs DFS traversal starting from allocation nodes to update calls to 4135 // reflect cloning decisions recorded earlier. For regular LTO this will 4136 // update the actual calls in the IR to call the appropriate function clone 4137 // (and add attributes to allocation calls), whereas for ThinLTO the decisions 4138 // are recorded in the summary entries. 
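  // The traversal is bottom-up: starting from each allocation node it first
  // recurses into the node's clones and then into its callers via CallerEdges,
  // with the Visited set ensuring each context node is updated exactly once
  // even when it is reachable along multiple paths.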
4139 DenseSet<const ContextNode *> Visited; 4140 for (auto &Entry : AllocationCallToContextNodeMap) 4141 UpdateCalls(Entry.second, Visited, UpdateCalls); 4142 4143 return Changed; 4144 } 4145 4146 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( 4147 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, 4148 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 4149 &FuncToAliasMap) { 4150 // The first "clone" is the original copy, we should only call this if we 4151 // needed to create new clones. 4152 assert(NumClones > 1); 4153 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 4154 VMaps.reserve(NumClones - 1); 4155 FunctionsClonedThinBackend++; 4156 for (unsigned I = 1; I < NumClones; I++) { 4157 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); 4158 auto *NewF = CloneFunction(&F, *VMaps.back()); 4159 FunctionClonesThinBackend++; 4160 // Strip memprof and callsite metadata from clone as they are no longer 4161 // needed. 4162 for (auto &BB : *NewF) { 4163 for (auto &Inst : BB) { 4164 Inst.setMetadata(LLVMContext::MD_memprof, nullptr); 4165 Inst.setMetadata(LLVMContext::MD_callsite, nullptr); 4166 } 4167 } 4168 std::string Name = getMemProfFuncName(F.getName(), I); 4169 auto *PrevF = M.getFunction(Name); 4170 if (PrevF) { 4171 // We might have created this when adjusting callsite in another 4172 // function. It should be a declaration. 4173 assert(PrevF->isDeclaration()); 4174 NewF->takeName(PrevF); 4175 PrevF->replaceAllUsesWith(NewF); 4176 PrevF->eraseFromParent(); 4177 } else 4178 NewF->setName(Name); 4179 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) 4180 << "created clone " << ore::NV("NewFunction", NewF)); 4181 4182 // Now handle aliases to this function, and clone those as well. 4183 if (!FuncToAliasMap.count(&F)) 4184 continue; 4185 for (auto *A : FuncToAliasMap[&F]) { 4186 std::string Name = getMemProfFuncName(A->getName(), I); 4187 auto *PrevA = M.getNamedAlias(Name); 4188 auto *NewA = GlobalAlias::create(A->getValueType(), 4189 A->getType()->getPointerAddressSpace(), 4190 A->getLinkage(), Name, NewF); 4191 NewA->copyAttributesFrom(A); 4192 if (PrevA) { 4193 // We might have created this when adjusting callsite in another 4194 // function. It should be a declaration. 4195 assert(PrevA->isDeclaration()); 4196 NewA->takeName(PrevA); 4197 PrevA->replaceAllUsesWith(NewA); 4198 PrevA->eraseFromParent(); 4199 } 4200 } 4201 } 4202 return VMaps; 4203 } 4204 4205 // Locate the summary for F. This is complicated by the fact that it might 4206 // have been internalized or promoted. 4207 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, 4208 const ModuleSummaryIndex *ImportSummary) { 4209 // FIXME: Ideally we would retain the original GUID in some fashion on the 4210 // function (e.g. as metadata), but for now do our best to locate the 4211 // summary without that information. 4212 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); 4213 if (!TheFnVI) 4214 // See if theFn was internalized, by checking index directly with 4215 // original name (this avoids the name adjustment done by getGUID() for 4216 // internal symbols). 4217 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); 4218 if (TheFnVI) 4219 return TheFnVI; 4220 // Now query with the original name before any promotion was performed. 
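  // (ThinLTO promotion renames an internal symbol by appending a suffix along
  // the lines of ".llvm.<module hash>"; getOriginalNameBeforePromote strips
  // that suffix so the pre-promotion GUID can be recomputed. The exact suffix
  // format is an implementation detail and is mentioned here only for
  // illustration.)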
4221 StringRef OrigName = 4222 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); 4223 std::string OrigId = GlobalValue::getGlobalIdentifier( 4224 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); 4225 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); 4226 if (TheFnVI) 4227 return TheFnVI; 4228 // Could be a promoted local imported from another module. We need to pass 4229 // down more info here to find the original module id. For now, try with 4230 // the OrigName which might have been stored in the OidGuidMap in the 4231 // index. This would not work if there were same-named locals in multiple 4232 // modules, however. 4233 auto OrigGUID = 4234 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); 4235 if (OrigGUID) 4236 TheFnVI = ImportSummary->getValueInfo(OrigGUID); 4237 return TheFnVI; 4238 } 4239 4240 bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo( 4241 Module &M) { 4242 ICallAnalysis = std::make_unique<ICallPromotionAnalysis>(); 4243 Symtab = std::make_unique<InstrProfSymtab>(); 4244 // Don't add canonical names, to avoid multiple functions to the symtab 4245 // when they both have the same root name with "." suffixes stripped. 4246 // If we pick the wrong one then this could lead to incorrect ICP and calling 4247 // a memprof clone that we don't actually create (resulting in linker unsats). 4248 // What this means is that the GUID of the function (or its PGOFuncName 4249 // metadata) *must* match that in the VP metadata to allow promotion. 4250 // In practice this should not be a limitation, since local functions should 4251 // have PGOFuncName metadata and global function names shouldn't need any 4252 // special handling (they should not get the ".llvm.*" suffix that the 4253 // canonicalization handling is attempting to strip). 4254 if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) { 4255 std::string SymtabFailure = toString(std::move(E)); 4256 M.getContext().emitError("Failed to create symtab: " + SymtabFailure); 4257 return false; 4258 } 4259 return true; 4260 } 4261 4262 bool MemProfContextDisambiguation::applyImport(Module &M) { 4263 assert(ImportSummary); 4264 bool Changed = false; 4265 4266 // We also need to clone any aliases that reference cloned functions, because 4267 // the modified callsites may invoke via the alias. Keep track of the aliases 4268 // for each function. 4269 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 4270 FuncToAliasMap; 4271 for (auto &A : M.aliases()) { 4272 auto *Aliasee = A.getAliaseeObject(); 4273 if (auto *F = dyn_cast<Function>(Aliasee)) 4274 FuncToAliasMap[F].insert(&A); 4275 } 4276 4277 if (!initializeIndirectCallPromotionInfo(M)) 4278 return false; 4279 4280 for (auto &F : M) { 4281 if (F.isDeclaration() || isMemProfClone(F)) 4282 continue; 4283 4284 OptimizationRemarkEmitter ORE(&F); 4285 4286 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 4287 bool ClonesCreated = false; 4288 unsigned NumClonesCreated = 0; 4289 auto CloneFuncIfNeeded = [&](unsigned NumClones) { 4290 // We should at least have version 0 which is the original copy. 4291 assert(NumClones > 0); 4292 // If only one copy needed use original. 
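      // (NumClones counts the original as version 0, mirroring the Versions
      // and Clones arrays in the summary, so a value of 1 means no cloning is
      // needed for this function.)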
      if (NumClones == 1)
        return;
      // If we already performed cloning of this function, confirm that the
      // requested number of clones matches (the thin link should ensure the
      // number of clones for each constituent callsite is consistent within
      // each function), before returning.
      if (ClonesCreated) {
        assert(NumClonesCreated == NumClones);
        return;
      }
      VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
      // The first "clone" is the original copy, which doesn't have a VMap.
      assert(VMaps.size() == NumClones - 1);
      Changed = true;
      ClonesCreated = true;
      NumClonesCreated = NumClones;
    };

    auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
                             Function *CalledFunction) {
      // Perform cloning if not yet done.
      CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());

      assert(!isMemProfClone(*CalledFunction));

      // Update the calls per the summary info.
      // Save orig name since it gets updated in the first iteration
      // below.
      auto CalleeOrigName = CalledFunction->getName();
      for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
        // Do nothing if this version calls the original version of its
        // callee.
        if (!StackNode.Clones[J])
          continue;
        auto NewF = M.getOrInsertFunction(
            getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
            CalledFunction->getFunctionType());
        CallBase *CBClone;
        // Copy 0 is the original function.
        if (!J)
          CBClone = CB;
        else
          CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
        CBClone->setCalledFunction(NewF);
        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " assigned to call function clone "
                 << ore::NV("Callee", NewF.getCallee()));
      }
    };

    // Locate the summary for F.
    ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
    // If not found, this could be an imported local (see comment in
    // findValueInfoForFunc). Skip for now as it will be cloned in its original
    // module (where it would have been promoted to global scope so should
    // satisfy any reference in this module).
    if (!TheFnVI)
      continue;

    auto *GVSummary =
        ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
    if (!GVSummary) {
      // Must have been imported, use the summary which matches the definition.
      // (might be multiple if this was a linkonce_odr).
      auto SrcModuleMD = F.getMetadata("thinlto_src_module");
      assert(SrcModuleMD &&
             "enable-import-metadata is needed to emit thinlto_src_module");
      StringRef SrcModule =
          dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
      for (auto &GVS : TheFnVI.getSummaryList()) {
        if (GVS->modulePath() == SrcModule) {
          GVSummary = GVS.get();
          break;
        }
      }
      assert(GVSummary && GVSummary->modulePath() == SrcModule);
    }

    // If this was an imported alias skip it as we won't have the function
    // summary, and it should be cloned in the original module.
4375 if (isa<AliasSummary>(GVSummary)) 4376 continue; 4377 4378 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); 4379 4380 if (FS->allocs().empty() && FS->callsites().empty()) 4381 continue; 4382 4383 auto SI = FS->callsites().begin(); 4384 auto AI = FS->allocs().begin(); 4385 4386 // To handle callsite infos synthesized for tail calls which have missing 4387 // frames in the profiled context, map callee VI to the synthesized callsite 4388 // info. 4389 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; 4390 // Iterate the callsites for this function in reverse, since we place all 4391 // those synthesized for tail calls at the end. 4392 for (auto CallsiteIt = FS->callsites().rbegin(); 4393 CallsiteIt != FS->callsites().rend(); CallsiteIt++) { 4394 auto &Callsite = *CallsiteIt; 4395 // Stop as soon as we see a non-synthesized callsite info (see comment 4396 // above loop). All the entries added for discovered tail calls have empty 4397 // stack ids. 4398 if (!Callsite.StackIdIndices.empty()) 4399 break; 4400 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite}); 4401 } 4402 4403 // Keeps track of needed ICP for the function. 4404 SmallVector<ICallAnalysisData> ICallAnalysisInfo; 4405 4406 // Assume for now that the instructions are in the exact same order 4407 // as when the summary was created, but confirm this is correct by 4408 // matching the stack ids. 4409 for (auto &BB : F) { 4410 for (auto &I : BB) { 4411 auto *CB = dyn_cast<CallBase>(&I); 4412 // Same handling as when creating module summary. 4413 if (!mayHaveMemprofSummary(CB)) 4414 continue; 4415 4416 auto *CalledValue = CB->getCalledOperand(); 4417 auto *CalledFunction = CB->getCalledFunction(); 4418 if (CalledValue && !CalledFunction) { 4419 CalledValue = CalledValue->stripPointerCasts(); 4420 // Stripping pointer casts can reveal a called function. 4421 CalledFunction = dyn_cast<Function>(CalledValue); 4422 } 4423 // Check if this is an alias to a function. If so, get the 4424 // called aliasee for the checks below. 4425 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 4426 assert(!CalledFunction && 4427 "Expected null called function in callsite for alias"); 4428 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 4429 } 4430 4431 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 4432 I.getMetadata(LLVMContext::MD_callsite)); 4433 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); 4434 4435 // Include allocs that were already assigned a memprof function 4436 // attribute in the statistics. 4437 if (CB->getAttributes().hasFnAttr("memprof")) { 4438 assert(!MemProfMD); 4439 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 4440 ? AllocTypeColdThinBackend++ 4441 : AllocTypeNotColdThinBackend++; 4442 OrigAllocsThinBackend++; 4443 AllocVersionsThinBackend++; 4444 if (!MaxAllocVersionsThinBackend) 4445 MaxAllocVersionsThinBackend = 1; 4446 continue; 4447 } 4448 4449 if (MemProfMD) { 4450 // Consult the next alloc node. 4451 assert(AI != FS->allocs().end()); 4452 auto &AllocNode = *(AI++); 4453 4454 // Sanity check that the MIB stack ids match between the summary and 4455 // instruction metadata. 
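          // The check below walks each MIB metadata node's stack context,
          // starting after the frames shared with the callsite context, and
          // verifies that, once directly recursive duplicates are collapsed,
          // the stack ids line up one-for-one with the summary's
          // StackIdIndices.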
4456 auto MIBIter = AllocNode.MIBs.begin(); 4457 for (auto &MDOp : MemProfMD->operands()) { 4458 assert(MIBIter != AllocNode.MIBs.end()); 4459 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = 4460 MIBIter->StackIdIndices.begin(); 4461 auto *MIBMD = cast<const MDNode>(MDOp); 4462 MDNode *StackMDNode = getMIBStackNode(MIBMD); 4463 assert(StackMDNode); 4464 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); 4465 auto ContextIterBegin = 4466 StackContext.beginAfterSharedPrefix(CallsiteContext); 4467 // Skip the checking on the first iteration. 4468 uint64_t LastStackContextId = 4469 (ContextIterBegin != StackContext.end() && 4470 *ContextIterBegin == 0) 4471 ? 1 4472 : 0; 4473 for (auto ContextIter = ContextIterBegin; 4474 ContextIter != StackContext.end(); ++ContextIter) { 4475 // If this is a direct recursion, simply skip the duplicate 4476 // entries, to be consistent with how the summary ids were 4477 // generated during ModuleSummaryAnalysis. 4478 if (LastStackContextId == *ContextIter) 4479 continue; 4480 LastStackContextId = *ContextIter; 4481 assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); 4482 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 4483 *ContextIter); 4484 StackIdIndexIter++; 4485 } 4486 MIBIter++; 4487 } 4488 4489 // Perform cloning if not yet done. 4490 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); 4491 4492 OrigAllocsThinBackend++; 4493 AllocVersionsThinBackend += AllocNode.Versions.size(); 4494 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) 4495 MaxAllocVersionsThinBackend = AllocNode.Versions.size(); 4496 4497 // If there is only one version that means we didn't end up 4498 // considering this function for cloning, and in that case the alloc 4499 // will still be none type or should have gotten the default NotCold. 4500 // Skip that after calling clone helper since that does some sanity 4501 // checks that confirm we haven't decided yet that we need cloning. 4502 // We might have a single version that is cold due to the 4503 // MinClonedColdBytePercent heuristic, make sure we don't skip in that 4504 // case. 4505 if (AllocNode.Versions.size() == 1 && 4506 (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) { 4507 assert((AllocationType)AllocNode.Versions[0] == 4508 AllocationType::NotCold || 4509 (AllocationType)AllocNode.Versions[0] == 4510 AllocationType::None); 4511 UnclonableAllocsThinBackend++; 4512 continue; 4513 } 4514 4515 // All versions should have a singular allocation type. 4516 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { 4517 return Type == ((uint8_t)AllocationType::NotCold | 4518 (uint8_t)AllocationType::Cold); 4519 })); 4520 4521 // Update the allocation types per the summary info. 4522 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { 4523 // Ignore any that didn't get an assigned allocation type. 4524 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) 4525 continue; 4526 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; 4527 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ 4528 : AllocTypeNotColdThinBackend++; 4529 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); 4530 auto A = llvm::Attribute::get(F.getContext(), "memprof", 4531 AllocTypeString); 4532 CallBase *CBClone; 4533 // Copy 0 is the original function. 
4534 if (!J) 4535 CBClone = CB; 4536 else 4537 // Since VMaps are only created for new clones, we index with 4538 // clone J-1 (J==0 is the original clone and does not have a VMaps 4539 // entry). 4540 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 4541 CBClone->addFnAttr(A); 4542 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) 4543 << ore::NV("AllocationCall", CBClone) << " in clone " 4544 << ore::NV("Caller", CBClone->getFunction()) 4545 << " marked with memprof allocation attribute " 4546 << ore::NV("Attribute", AllocTypeString)); 4547 } 4548 } else if (!CallsiteContext.empty()) { 4549 if (!CalledFunction) { 4550 #ifndef NDEBUG 4551 // We should have skipped inline assembly calls. 4552 auto *CI = dyn_cast<CallInst>(CB); 4553 assert(!CI || !CI->isInlineAsm()); 4554 #endif 4555 // We should have skipped direct calls via a Constant. 4556 assert(CalledValue && !isa<Constant>(CalledValue)); 4557 4558 // This is an indirect call, see if we have profile information and 4559 // whether any clones were recorded for the profiled targets (that 4560 // we synthesized CallsiteInfo summary records for when building the 4561 // index). 4562 auto NumClones = 4563 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo); 4564 4565 // Perform cloning if not yet done. This is done here in case 4566 // we don't need to do ICP, but might need to clone this 4567 // function as it is the target of other cloned calls. 4568 if (NumClones) 4569 CloneFuncIfNeeded(NumClones); 4570 } 4571 4572 else { 4573 // Consult the next callsite node. 4574 assert(SI != FS->callsites().end()); 4575 auto &StackNode = *(SI++); 4576 4577 #ifndef NDEBUG 4578 // Sanity check that the stack ids match between the summary and 4579 // instruction metadata. 4580 auto StackIdIndexIter = StackNode.StackIdIndices.begin(); 4581 for (auto StackId : CallsiteContext) { 4582 assert(StackIdIndexIter != StackNode.StackIdIndices.end()); 4583 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 4584 StackId); 4585 StackIdIndexIter++; 4586 } 4587 #endif 4588 4589 CloneCallsite(StackNode, CB, CalledFunction); 4590 } 4591 } else if (CB->isTailCall() && CalledFunction) { 4592 // Locate the synthesized callsite info for the callee VI, if any was 4593 // created, and use that for cloning. 4594 ValueInfo CalleeVI = 4595 findValueInfoForFunc(*CalledFunction, M, ImportSummary); 4596 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) { 4597 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI); 4598 assert(Callsite != MapTailCallCalleeVIToCallsite.end()); 4599 CloneCallsite(Callsite->second, CB, CalledFunction); 4600 } 4601 } 4602 } 4603 } 4604 4605 // Now do any promotion required for cloning. 4606 performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE); 4607 } 4608 4609 // We skip some of the functions and instructions above, so remove all the 4610 // metadata in a single sweep here. 4611 for (auto &F : M) { 4612 // We can skip memprof clones because createFunctionClones already strips 4613 // the metadata from the newly created clones. 
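    // (All cloning and attribute decisions have been applied above, so the
    // remaining !memprof and !callsite metadata is no longer needed for the
    // rest of the compile.)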
4614 if (F.isDeclaration() || isMemProfClone(F)) 4615 continue; 4616 for (auto &BB : F) { 4617 for (auto &I : BB) { 4618 if (!isa<CallBase>(I)) 4619 continue; 4620 I.setMetadata(LLVMContext::MD_memprof, nullptr); 4621 I.setMetadata(LLVMContext::MD_callsite, nullptr); 4622 } 4623 } 4624 } 4625 4626 return Changed; 4627 } 4628 4629 unsigned MemProfContextDisambiguation::recordICPInfo( 4630 CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites, 4631 ArrayRef<CallsiteInfo>::iterator &SI, 4632 SmallVector<ICallAnalysisData> &ICallAnalysisInfo) { 4633 // First see if we have profile information for this indirect call. 4634 uint32_t NumCandidates; 4635 uint64_t TotalCount; 4636 auto CandidateProfileData = 4637 ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount, 4638 NumCandidates); 4639 if (CandidateProfileData.empty()) 4640 return 0; 4641 4642 // Iterate through all of the candidate profiled targets along with the 4643 // CallsiteInfo summary records synthesized for them when building the index, 4644 // and see if any are cloned and/or refer to clones. 4645 bool ICPNeeded = false; 4646 unsigned NumClones = 0; 4647 size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI); 4648 for (const auto &Candidate : CandidateProfileData) { 4649 #ifndef NDEBUG 4650 auto CalleeValueInfo = 4651 #endif 4652 ImportSummary->getValueInfo(Candidate.Value); 4653 // We might not have a ValueInfo if this is a distributed 4654 // ThinLTO backend and decided not to import that function. 4655 assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo); 4656 assert(SI != AllCallsites.end()); 4657 auto &StackNode = *(SI++); 4658 // See if any of the clones of the indirect callsite for this 4659 // profiled target should call a cloned version of the profiled 4660 // target. We only need to do the ICP here if so. 4661 ICPNeeded |= llvm::any_of(StackNode.Clones, 4662 [](unsigned CloneNo) { return CloneNo != 0; }); 4663 // Every callsite in the same function should have been cloned the same 4664 // number of times. 4665 assert(!NumClones || NumClones == StackNode.Clones.size()); 4666 NumClones = StackNode.Clones.size(); 4667 } 4668 if (!ICPNeeded) 4669 return NumClones; 4670 // Save information for ICP, which is performed later to avoid messing up the 4671 // current function traversal. 4672 ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates, 4673 TotalCount, CallsiteInfoStartIndex}); 4674 return NumClones; 4675 } 4676 4677 void MemProfContextDisambiguation::performICP( 4678 Module &M, ArrayRef<CallsiteInfo> AllCallsites, 4679 ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps, 4680 ArrayRef<ICallAnalysisData> ICallAnalysisInfo, 4681 OptimizationRemarkEmitter &ORE) { 4682 // Now do any promotion required for cloning. Specifically, for each 4683 // recorded ICP candidate (which was only recorded because one clone of that 4684 // candidate should call a cloned target), we perform ICP (speculative 4685 // devirtualization) for each clone of the callsite, and update its callee 4686 // to the appropriate clone. Note that the ICP compares against the original 4687 // version of the target, which is what is in the vtable. 
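  // Roughly, for each promotable candidate this means: promoteIndirectCall
  // inserts the guarded "compare against the original target, then call it
  // directly" sequence into every clone of the caller, the resulting direct
  // call in each clone is then retargeted to the recorded callee clone, and
  // any value-profile counts left for unpromoted targets are re-annotated at
  // the end.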
4688 for (auto &Info : ICallAnalysisInfo) { 4689 auto *CB = Info.CB; 4690 auto CallsiteIndex = Info.CallsiteInfoStartIndex; 4691 auto TotalCount = Info.TotalCount; 4692 unsigned NumPromoted = 0; 4693 unsigned NumClones = 0; 4694 4695 for (auto &Candidate : Info.CandidateProfileData) { 4696 auto &StackNode = AllCallsites[CallsiteIndex++]; 4697 4698 // All calls in the same function must have the same number of clones. 4699 assert(!NumClones || NumClones == StackNode.Clones.size()); 4700 NumClones = StackNode.Clones.size(); 4701 4702 // See if the target is in the module. If it wasn't imported, it is 4703 // possible that this profile could have been collected on a different 4704 // target (or version of the code), and we need to be conservative 4705 // (similar to what is done in the ICP pass). 4706 Function *TargetFunction = Symtab->getFunction(Candidate.Value); 4707 if (TargetFunction == nullptr || 4708 // Any ThinLTO global dead symbol removal should have already 4709 // occurred, so it should be safe to promote when the target is a 4710 // declaration. 4711 // TODO: Remove internal option once more fully tested. 4712 (MemProfRequireDefinitionForPromotion && 4713 TargetFunction->isDeclaration())) { 4714 ORE.emit([&]() { 4715 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB) 4716 << "Memprof cannot promote indirect call: target with md5sum " 4717 << ore::NV("target md5sum", Candidate.Value) << " not found"; 4718 }); 4719 // FIXME: See if we can use the new declaration importing support to 4720 // at least get the declarations imported for this case. Hot indirect 4721 // targets should have been imported normally, however. 4722 continue; 4723 } 4724 4725 // Check if legal to promote 4726 const char *Reason = nullptr; 4727 if (!isLegalToPromote(*CB, TargetFunction, &Reason)) { 4728 ORE.emit([&]() { 4729 return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB) 4730 << "Memprof cannot promote indirect call to " 4731 << ore::NV("TargetFunction", TargetFunction) 4732 << " with count of " << ore::NV("TotalCount", TotalCount) 4733 << ": " << Reason; 4734 }); 4735 continue; 4736 } 4737 4738 assert(!isMemProfClone(*TargetFunction)); 4739 4740 // Handle each call clone, applying ICP so that each clone directly 4741 // calls the specified callee clone, guarded by the appropriate ICP 4742 // check. 4743 CallBase *CBClone = CB; 4744 for (unsigned J = 0; J < NumClones; J++) { 4745 // Copy 0 is the original function. 4746 if (J > 0) 4747 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 4748 // We do the promotion using the original name, so that the comparison 4749 // is against the name in the vtable. Then just below, change the new 4750 // direct call to call the cloned function. 4751 auto &DirectCall = 4752 pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count, 4753 TotalCount, isSamplePGO, &ORE); 4754 auto *TargetToUse = TargetFunction; 4755 // Call original if this version calls the original version of its 4756 // callee. 
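        // (Clone number 0 denotes the original callee, so the direct call is
        // only retargeted when Clones[J] is non-zero.)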
        if (StackNode.Clones[J]) {
          TargetToUse =
              cast<Function>(M.getOrInsertFunction(
                                  getMemProfFuncName(TargetFunction->getName(),
                                                     StackNode.Clones[J]),
                                  TargetFunction->getFunctionType())
                                 .getCallee());
        }
        DirectCall.setCalledFunction(TargetToUse);
        ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
                 << ore::NV("Call", CBClone) << " in clone "
                 << ore::NV("Caller", CBClone->getFunction())
                 << " promoted and assigned to call function clone "
                 << ore::NV("Callee", TargetToUse));
      }

      // Update TotalCount (all clones should get the same count above).
      TotalCount -= Candidate.Count;
      NumPromoted++;
    }
    // Adjust the MD.prof metadata for all clones, now that we have the new
    // TotalCount and the number promoted.
    CallBase *CBClone = CB;
    for (unsigned J = 0; J < NumClones; J++) {
      // Copy 0 is the original function.
      if (J > 0)
        CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
      // First delete the old one.
      CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
      // If all targets were promoted, we don't need the MD.prof metadata.
      // Otherwise we need to re-annotate it with the remaining un-promoted
      // records.
      if (TotalCount != 0)
        annotateValueSite(
            M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
            TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
  if (DumpCCG) {
    dbgs() << "CCG before cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("postbuild");

  if (VerifyCCG) {
    check();
  }

  identifyClones();

  if (VerifyCCG) {
    check();
  }

  if (DumpCCG) {
    dbgs() << "CCG after cloning:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("cloned");

  bool Changed = assignFunctions();

  if (DumpCCG) {
    dbgs() << "CCG after assigning function clones:\n";
    dbgs() << *this;
  }
  if (ExportToDot)
    exportToDot("clonefuncassign");

  if (MemProfReportHintedSizes)
    printTotalSizes(errs());

  return Changed;
}

bool MemProfContextDisambiguation::processModule(
    Module &M,
    llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {

  // If we have an import summary, then the cloning decisions were made during
  // the thin link on the index. Apply them and return.
  if (ImportSummary)
    return applyImport(M);

  // TODO: If/when other types of memprof cloning are enabled beyond just for
  // hot and cold, we will need to change this to individually control the
  // AllocationType passed to addStackNodesForMIB during CCG construction.
  // Note that we specifically check this after applying imports above, so that
  // the option doesn't need to be passed to distributed ThinLTO backend
  // clang processes, which won't necessarily have visibility into the linker
  // dependences. Instead the information is communicated from the LTO link to
  // the backends via the combined summary index.
4854 if (!SupportsHotColdNew) 4855 return false; 4856 4857 ModuleCallsiteContextGraph CCG(M, OREGetter); 4858 return CCG.process(); 4859 } 4860 4861 MemProfContextDisambiguation::MemProfContextDisambiguation( 4862 const ModuleSummaryIndex *Summary, bool isSamplePGO) 4863 : ImportSummary(Summary), isSamplePGO(isSamplePGO) { 4864 if (ImportSummary) { 4865 // The MemProfImportSummary should only be used for testing ThinLTO 4866 // distributed backend handling via opt, in which case we don't have a 4867 // summary from the pass pipeline. 4868 assert(MemProfImportSummary.empty()); 4869 return; 4870 } 4871 if (MemProfImportSummary.empty()) 4872 return; 4873 4874 auto ReadSummaryFile = 4875 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); 4876 if (!ReadSummaryFile) { 4877 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), 4878 "Error loading file '" + MemProfImportSummary + 4879 "': "); 4880 return; 4881 } 4882 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); 4883 if (!ImportSummaryForTestingOrErr) { 4884 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), 4885 "Error parsing file '" + MemProfImportSummary + 4886 "': "); 4887 return; 4888 } 4889 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); 4890 ImportSummary = ImportSummaryForTesting.get(); 4891 } 4892 4893 PreservedAnalyses MemProfContextDisambiguation::run(Module &M, 4894 ModuleAnalysisManager &AM) { 4895 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 4896 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { 4897 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 4898 }; 4899 if (!processModule(M, OREGetter)) 4900 return PreservedAnalyses::all(); 4901 return PreservedAnalyses::none(); 4902 } 4903 4904 void MemProfContextDisambiguation::run( 4905 ModuleSummaryIndex &Index, 4906 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 4907 isPrevailing) { 4908 // TODO: If/when other types of memprof cloning are enabled beyond just for 4909 // hot and cold, we will need to change this to individually control the 4910 // AllocationType passed to addStackNodesForMIB during CCG construction. 4911 // The index was set from the option, so these should be in sync. 4912 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); 4913 if (!SupportsHotColdNew) 4914 return; 4915 4916 IndexCallsiteContextGraph CCG(Index, isPrevailing); 4917 CCG.process(); 4918 } 4919
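// A minimal end-to-end illustration of the transformation performed above.
// The IR below is hypothetical and heavily simplified, and the ".memprof."
// clone suffix produced by getMemProfFuncName is shown purely as an example
// of the naming scheme, not as a guarantee:
//
//   Before: a cold and a not-cold context reach the same allocation through
//   the same call chain:
//     call void @leaf()
//
//   After cloning and call updating, callers on the two paths diverge:
//     call void @leaf()             ; not-cold contexts keep the original
//     call void @leaf.memprof.1()   ; cold contexts call the clone
//
//   Inside the cold clone, the allocation call itself is annotated with the
//   "memprof"="cold" function attribute, which later stages use to pick the
//   appropriate allocation behavior.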