//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

namespace llvm {
// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, reflecting any inlining performed on those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
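///
/// As a small illustrative example (hypothetical call chains, not from any
/// particular profile), consider an allocation reached via two profiled
/// contexts:
///   main -> A -> C -> alloc   (cold)
///   main -> B -> C -> alloc   (notcold)
/// The graph gets a node for the allocation plus stack nodes for C, A, B and
/// main. The context through A is assigned id 1 and the context through B is
/// assigned id 2, so the allocation node, the node for C, and the edge between
/// them all carry {1, 2}, while the nodes for A and B carry {1} and {2}
/// respectively. A node carrying both ids (here C) is exactly what the later
/// cloning step splits so that each context's allocation behavior can be
/// uniquely identified.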
146 template <typename DerivedCCG, typename FuncTy, typename CallTy> 147 class CallsiteContextGraph { 148 public: 149 CallsiteContextGraph() = default; 150 CallsiteContextGraph(const CallsiteContextGraph &) = default; 151 CallsiteContextGraph(CallsiteContextGraph &&) = default; 152 153 /// Main entry point to perform analysis and transformations on graph. 154 bool process(); 155 156 /// Perform cloning on the graph necessary to uniquely identify the allocation 157 /// behavior of an allocation based on its context. 158 void identifyClones(); 159 160 /// Assign callsite clones to functions, cloning functions as needed to 161 /// accommodate the combinations of their callsite clones reached by callers. 162 /// For regular LTO this clones functions and callsites in the IR, but for 163 /// ThinLTO the cloning decisions are noted in the summaries and later applied 164 /// in applyImport. 165 bool assignFunctions(); 166 167 void dump() const; 168 void print(raw_ostream &OS) const; 169 170 friend raw_ostream &operator<<(raw_ostream &OS, 171 const CallsiteContextGraph &CCG) { 172 CCG.print(OS); 173 return OS; 174 } 175 176 friend struct GraphTraits< 177 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; 178 friend struct DOTGraphTraits< 179 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; 180 181 void exportToDot(std::string Label) const; 182 183 /// Represents a function clone via FuncTy pointer and clone number pair. 184 struct FuncInfo final 185 : public std::pair<FuncTy *, unsigned /*Clone number*/> { 186 using Base = std::pair<FuncTy *, unsigned>; 187 FuncInfo(const Base &B) : Base(B) {} 188 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} 189 explicit operator bool() const { return this->first != nullptr; } 190 FuncTy *func() const { return this->first; } 191 unsigned cloneNo() const { return this->second; } 192 }; 193 194 /// Represents a callsite clone via CallTy and clone number pair. 195 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { 196 using Base = std::pair<CallTy, unsigned>; 197 CallInfo(const Base &B) : Base(B) {} 198 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) 199 : Base(Call, CloneNo) {} 200 explicit operator bool() const { return (bool)this->first; } 201 CallTy call() const { return this->first; } 202 unsigned cloneNo() const { return this->second; } 203 void setCloneNo(unsigned N) { this->second = N; } 204 void print(raw_ostream &OS) const { 205 if (!operator bool()) { 206 assert(!cloneNo()); 207 OS << "null Call"; 208 return; 209 } 210 call()->print(OS); 211 OS << "\t(clone " << cloneNo() << ")"; 212 } 213 void dump() const { 214 print(dbgs()); 215 dbgs() << "\n"; 216 } 217 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { 218 Call.print(OS); 219 return OS; 220 } 221 }; 222 223 struct ContextEdge; 224 225 /// Node in the Callsite Context Graph 226 struct ContextNode { 227 // Keep this for now since in the IR case where we have an Instruction* it 228 // is not as immediately discoverable. Used for printing richer information 229 // when dumping graph. 230 bool IsAllocation; 231 232 // Keeps track of when the Call was reset to null because there was 233 // recursion. 234 bool Recursive = false; 235 236 // The corresponding allocation or interior call. 
237 CallInfo Call; 238 239 // For alloc nodes this is a unique id assigned when constructed, and for 240 // callsite stack nodes it is the original stack id when the node is 241 // constructed from the memprof MIB metadata on the alloc nodes. Note that 242 // this is only used when matching callsite metadata onto the stack nodes 243 // created when processing the allocation memprof MIBs, and for labeling 244 // nodes in the dot graph. Therefore we don't bother to assign a value for 245 // clones. 246 uint64_t OrigStackOrAllocId = 0; 247 248 // This will be formed by ORing together the AllocationType enum values 249 // for contexts including this node. 250 uint8_t AllocTypes = 0; 251 252 // Edges to all callees in the profiled call stacks. 253 // TODO: Should this be a map (from Callee node) for more efficient lookup? 254 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; 255 256 // Edges to all callers in the profiled call stacks. 257 // TODO: Should this be a map (from Caller node) for more efficient lookup? 258 std::vector<std::shared_ptr<ContextEdge>> CallerEdges; 259 260 // The set of IDs for contexts including this node. 261 DenseSet<uint32_t> ContextIds; 262 263 // List of clones of this ContextNode, initially empty. 264 std::vector<ContextNode *> Clones; 265 266 // If a clone, points to the original uncloned node. 267 ContextNode *CloneOf = nullptr; 268 269 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} 270 271 ContextNode(bool IsAllocation, CallInfo C) 272 : IsAllocation(IsAllocation), Call(C) {} 273 274 void addClone(ContextNode *Clone) { 275 if (CloneOf) { 276 CloneOf->Clones.push_back(Clone); 277 Clone->CloneOf = CloneOf; 278 } else { 279 Clones.push_back(Clone); 280 assert(!Clone->CloneOf); 281 Clone->CloneOf = this; 282 } 283 } 284 285 ContextNode *getOrigNode() { 286 if (!CloneOf) 287 return this; 288 return CloneOf; 289 } 290 291 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, 292 unsigned int ContextId); 293 294 ContextEdge *findEdgeFromCallee(const ContextNode *Callee); 295 ContextEdge *findEdgeFromCaller(const ContextNode *Caller); 296 void eraseCalleeEdge(const ContextEdge *Edge); 297 void eraseCallerEdge(const ContextEdge *Edge); 298 299 void setCall(CallInfo C) { Call = C; } 300 301 bool hasCall() const { return (bool)Call.call(); } 302 303 void printCall(raw_ostream &OS) const { Call.print(OS); } 304 305 // True if this node was effectively removed from the graph, in which case 306 // its context id set, caller edges, and callee edges should all be empty. 307 bool isRemoved() const { 308 assert(ContextIds.empty() == 309 (CalleeEdges.empty() && CallerEdges.empty())); 310 return ContextIds.empty(); 311 } 312 313 void dump() const; 314 void print(raw_ostream &OS) const; 315 316 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { 317 Node.print(OS); 318 return OS; 319 } 320 }; 321 322 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or 323 /// callee. 324 struct ContextEdge { 325 ContextNode *Callee; 326 ContextNode *Caller; 327 328 // This will be formed by ORing together the AllocationType enum values 329 // for contexts including this edge. 330 uint8_t AllocTypes = 0; 331 332 // The set of IDs for contexts including this edge. 
333 DenseSet<uint32_t> ContextIds; 334 335 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, 336 DenseSet<uint32_t> ContextIds) 337 : Callee(Callee), Caller(Caller), AllocTypes(AllocType), 338 ContextIds(ContextIds) {} 339 340 DenseSet<uint32_t> &getContextIds() { return ContextIds; } 341 342 void dump() const; 343 void print(raw_ostream &OS) const; 344 345 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { 346 Edge.print(OS); 347 return OS; 348 } 349 }; 350 351 /// Helper to remove callee edges that have allocation type None (due to not 352 /// carrying any context ids) after transformations. 353 void removeNoneTypeCalleeEdges(ContextNode *Node); 354 355 protected: 356 /// Get a list of nodes corresponding to the stack ids in the given callsite 357 /// context. 358 template <class NodeT, class IteratorT> 359 std::vector<uint64_t> 360 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); 361 362 /// Adds nodes for the given allocation and any stack ids on its memprof MIB 363 /// metadata (or summary). 364 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); 365 366 /// Adds nodes for the given MIB stack ids. 367 template <class NodeT, class IteratorT> 368 void addStackNodesForMIB(ContextNode *AllocNode, 369 CallStack<NodeT, IteratorT> &StackContext, 370 CallStack<NodeT, IteratorT> &CallsiteContext, 371 AllocationType AllocType); 372 373 /// Matches all callsite metadata (or summary) to the nodes created for 374 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any 375 /// inlining performed on those callsite instructions. 376 void updateStackNodes(); 377 378 /// Update graph to conservatively handle any callsite stack nodes that target 379 /// multiple different callee target functions. 380 void handleCallsitesWithMultipleTargets(); 381 382 /// Save lists of calls with MemProf metadata in each function, for faster 383 /// iteration. 384 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata; 385 386 /// Map from callsite node to the enclosing caller function. 387 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; 388 389 private: 390 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; 391 392 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>, 393 const FuncTy *, DenseSet<uint32_t>>; 394 395 /// Assigns the given Node to calls at or inlined into the location with 396 /// the Node's stack id, after post order traversing and processing its 397 /// caller nodes. Uses the call information recorded in the given 398 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences 399 /// as needed. Called by updateStackNodes which sets up the given 400 /// StackIdToMatchingCalls map. 401 void assignStackNodesPostOrder( 402 ContextNode *Node, DenseSet<const ContextNode *> &Visited, 403 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls); 404 405 /// Duplicates the given set of context ids, updating the provided 406 /// map from each original id with the newly generated context ids, 407 /// and returning the new duplicated id set. 408 DenseSet<uint32_t> duplicateContextIds( 409 const DenseSet<uint32_t> &StackSequenceContextIds, 410 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); 411 412 /// Propagates all duplicated context ids across the graph. 
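  /// Duplicate ids are synthesized (see updateStackNodes) when more than one
  /// call is matched to the same stack id sequence, e.g. due to cloning or the
  /// MIB context pruning, so that each such call gets its own copy of the
  /// context ids; this routine propagates those copies onto all affected nodes
  /// and edges.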
413 void propagateDuplicateContextIds( 414 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); 415 416 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, 417 /// else to its callers. Also updates OrigNode's edges to remove any context 418 /// ids moved to the newly created edge. 419 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, 420 bool TowardsCallee); 421 422 /// Get the stack id corresponding to the given Id or Index (for IR this will 423 /// return itself, for a summary index this will return the id recorded in the 424 /// index for that stack id index value). 425 uint64_t getStackId(uint64_t IdOrIndex) const { 426 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); 427 } 428 429 /// Returns true if the given call targets the callee of the given edge, or if 430 /// we were able to identify the call chain through intermediate tail calls. 431 /// In the latter case new context nodes are added to the graph for the 432 /// identified tail calls, and their synthesized nodes are added to 433 /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the 434 /// next element after the input position (either incremented or updated after 435 /// removing the old edge). 436 bool 437 calleesMatch(CallTy Call, EdgeIter &EI, 438 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap); 439 440 /// Returns true if the given call targets the given function, or if we were 441 /// able to identify the call chain through intermediate tail calls (in which 442 /// case FoundCalleeChain will be populated). 443 bool calleeMatchesFunc( 444 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc, 445 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) { 446 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc( 447 Call, Func, CallerFunc, FoundCalleeChain); 448 } 449 450 /// Get a list of nodes corresponding to the stack ids in the given 451 /// callsite's context. 452 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { 453 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( 454 Call); 455 } 456 457 /// Get the last stack id in the context for callsite. 458 uint64_t getLastStackId(CallTy Call) { 459 return static_cast<DerivedCCG *>(this)->getLastStackId(Call); 460 } 461 462 /// Update the allocation call to record type of allocated memory. 463 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { 464 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; 465 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); 466 } 467 468 /// Update non-allocation call to invoke (possibly cloned) function 469 /// CalleeFunc. 470 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { 471 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc); 472 } 473 474 /// Clone the given function for the given callsite, recording mapping of all 475 /// of the functions tracked calls to their new versions in the CallMap. 476 /// Assigns new clones to clone number CloneNo. 477 FuncInfo cloneFunctionForCallsite( 478 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 479 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 480 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( 481 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); 482 } 483 484 /// Gets a label to use in the dot graph for the given call clone in the given 485 /// function. 
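  /// E.g. a label of the form "main -> _Znam" (function names purely
  /// illustrative), with the callee name in the index case including any
  /// assigned clone suffix.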
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  /// Helpers to find the node corresponding to the given call or stack id.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Removes the node information recorded for the given call.
  void unsetNodeForInst(const CallInfo &C);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids);

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           EdgeIter *CallerEdgeI = nullptr);

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     EdgeIter *CallerEdgeI = nullptr,
                                     bool NewClone = false);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context.
  void identifyClones(ContextNode *Node,
                      DenseSet<const ContextNode *> &Visited);

  /// Map from each context ID to the AllocationType assigned to that context.
  std::map<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
557 unsigned int LastContextId = 0; 558 }; 559 560 template <typename DerivedCCG, typename FuncTy, typename CallTy> 561 using ContextNode = 562 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; 563 template <typename DerivedCCG, typename FuncTy, typename CallTy> 564 using ContextEdge = 565 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; 566 template <typename DerivedCCG, typename FuncTy, typename CallTy> 567 using FuncInfo = 568 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; 569 template <typename DerivedCCG, typename FuncTy, typename CallTy> 570 using CallInfo = 571 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; 572 573 /// CRTP derived class for graphs built from IR (regular LTO). 574 class ModuleCallsiteContextGraph 575 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 576 Instruction *> { 577 public: 578 ModuleCallsiteContextGraph( 579 Module &M, 580 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); 581 582 private: 583 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 584 Instruction *>; 585 586 uint64_t getStackId(uint64_t IdOrIndex) const; 587 bool calleeMatchesFunc( 588 Instruction *Call, const Function *Func, const Function *CallerFunc, 589 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); 590 bool findProfiledCalleeThroughTailCalls( 591 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, 592 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, 593 bool &FoundMultipleCalleeChains); 594 uint64_t getLastStackId(Instruction *Call); 595 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); 596 void updateAllocationCall(CallInfo &Call, AllocationType AllocType); 597 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); 598 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 599 Instruction *>::FuncInfo 600 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, 601 std::map<CallInfo, CallInfo> &CallMap, 602 std::vector<CallInfo> &CallsWithMetadataInFunc, 603 unsigned CloneNo); 604 std::string getLabel(const Function *Func, const Instruction *Call, 605 unsigned CloneNo) const; 606 607 const Module &Mod; 608 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; 609 }; 610 611 /// Represents a call in the summary index graph, which can either be an 612 /// allocation or an interior callsite node in an allocation's context. 613 /// Holds a pointer to the corresponding data structure in the index. 614 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { 615 IndexCall() : PointerUnion() {} 616 IndexCall(std::nullptr_t) : IndexCall() {} 617 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} 618 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} 619 IndexCall(PointerUnion PT) : PointerUnion(PT) {} 620 621 IndexCall *operator->() { return this; } 622 623 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; } 624 625 void print(raw_ostream &OS) const { 626 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) { 627 OS << *AI; 628 } else { 629 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase()); 630 assert(CI); 631 OS << *CI; 632 } 633 } 634 }; 635 636 /// CRTP derived class for graphs built from summary index (ThinLTO). 
637 class IndexCallsiteContextGraph 638 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 639 IndexCall> { 640 public: 641 IndexCallsiteContextGraph( 642 ModuleSummaryIndex &Index, 643 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 644 isPrevailing); 645 646 ~IndexCallsiteContextGraph() { 647 // Now that we are done with the graph it is safe to add the new 648 // CallsiteInfo structs to the function summary vectors. The graph nodes 649 // point into locations within these vectors, so we don't want to add them 650 // any earlier. 651 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) { 652 auto *FS = I.first; 653 for (auto &Callsite : I.second) 654 FS->addCallsite(*Callsite.second); 655 } 656 } 657 658 private: 659 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 660 IndexCall>; 661 662 uint64_t getStackId(uint64_t IdOrIndex) const; 663 bool calleeMatchesFunc( 664 IndexCall &Call, const FunctionSummary *Func, 665 const FunctionSummary *CallerFunc, 666 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); 667 bool findProfiledCalleeThroughTailCalls( 668 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, 669 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, 670 bool &FoundMultipleCalleeChains); 671 uint64_t getLastStackId(IndexCall &Call); 672 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); 673 void updateAllocationCall(CallInfo &Call, AllocationType AllocType); 674 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); 675 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 676 IndexCall>::FuncInfo 677 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, 678 std::map<CallInfo, CallInfo> &CallMap, 679 std::vector<CallInfo> &CallsWithMetadataInFunc, 680 unsigned CloneNo); 681 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, 682 unsigned CloneNo) const; 683 684 // Saves mapping from function summaries containing memprof records back to 685 // its VI, for use in checking and debugging. 686 std::map<const FunctionSummary *, ValueInfo> FSToVIMap; 687 688 const ModuleSummaryIndex &Index; 689 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 690 isPrevailing; 691 692 // Saves/owns the callsite info structures synthesized for missing tail call 693 // frames that we discover while building the graph. 694 // It maps from the summary of the function making the tail call, to a map 695 // of callee ValueInfo to corresponding synthesized callsite info. 
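  // For example (hypothetical functions): if a profiled context contains the
  // frame sequence A -> C, but in the IR function A calls B and B only tail
  // calls C, the frame for B is missing from the profile. A CallsiteInfo
  // representing B's call to C is synthesized and recorded here, and added to
  // B's summary once the graph is complete (see the destructor above).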
696 std::unordered_map<FunctionSummary *, 697 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>> 698 FunctionCalleesToSynthesizedCallsiteInfos; 699 }; 700 } // namespace 701 702 namespace llvm { 703 template <> 704 struct DenseMapInfo<typename CallsiteContextGraph< 705 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> 706 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; 707 template <> 708 struct DenseMapInfo<typename CallsiteContextGraph< 709 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> 710 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; 711 template <> 712 struct DenseMapInfo<IndexCall> 713 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; 714 } // end namespace llvm 715 716 namespace { 717 718 struct FieldSeparator { 719 bool Skip = true; 720 const char *Sep; 721 722 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} 723 }; 724 725 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { 726 if (FS.Skip) { 727 FS.Skip = false; 728 return OS; 729 } 730 return OS << FS.Sep; 731 } 732 733 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc 734 // type we should actually use on the corresponding allocation. 735 // If we can't clone a node that has NotCold+Cold alloc type, we will fall 736 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold 737 // from NotCold. 738 AllocationType allocTypeToUse(uint8_t AllocTypes) { 739 assert(AllocTypes != (uint8_t)AllocationType::None); 740 if (AllocTypes == 741 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 742 return AllocationType::NotCold; 743 else 744 return (AllocationType)AllocTypes; 745 } 746 747 // Helper to check if the alloc types for all edges recorded in the 748 // InAllocTypes vector match the alloc types for all edges in the Edges 749 // vector. 750 template <typename DerivedCCG, typename FuncTy, typename CallTy> 751 bool allocTypesMatch( 752 const std::vector<uint8_t> &InAllocTypes, 753 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> 754 &Edges) { 755 return std::equal( 756 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), 757 [](const uint8_t &l, 758 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { 759 // Can share if one of the edges is None type - don't 760 // care about the type along that edge as it doesn't 761 // exist for those context ids. 
762 if (l == (uint8_t)AllocationType::None || 763 r->AllocTypes == (uint8_t)AllocationType::None) 764 return true; 765 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes); 766 }); 767 } 768 769 } // end anonymous namespace 770 771 template <typename DerivedCCG, typename FuncTy, typename CallTy> 772 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 773 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( 774 const CallInfo &C) { 775 ContextNode *Node = getNodeForAlloc(C); 776 if (Node) 777 return Node; 778 779 return NonAllocationCallToContextNodeMap.lookup(C); 780 } 781 782 template <typename DerivedCCG, typename FuncTy, typename CallTy> 783 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 784 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( 785 const CallInfo &C) { 786 return AllocationCallToContextNodeMap.lookup(C); 787 } 788 789 template <typename DerivedCCG, typename FuncTy, typename CallTy> 790 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 791 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( 792 uint64_t StackId) { 793 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); 794 if (StackEntryNode != StackEntryIdToContextNodeMap.end()) 795 return StackEntryNode->second; 796 return nullptr; 797 } 798 799 template <typename DerivedCCG, typename FuncTy, typename CallTy> 800 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst( 801 const CallInfo &C) { 802 AllocationCallToContextNodeMap.erase(C) || 803 NonAllocationCallToContextNodeMap.erase(C); 804 assert(!AllocationCallToContextNodeMap.count(C) && 805 !NonAllocationCallToContextNodeMap.count(C)); 806 } 807 808 template <typename DerivedCCG, typename FuncTy, typename CallTy> 809 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 810 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, 811 unsigned int ContextId) { 812 for (auto &Edge : CallerEdges) { 813 if (Edge->Caller == Caller) { 814 Edge->AllocTypes |= (uint8_t)AllocType; 815 Edge->getContextIds().insert(ContextId); 816 return; 817 } 818 } 819 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( 820 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); 821 CallerEdges.push_back(Edge); 822 Caller->CalleeEdges.push_back(Edge); 823 } 824 825 template <typename DerivedCCG, typename FuncTy, typename CallTy> 826 void CallsiteContextGraph< 827 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { 828 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { 829 auto Edge = *EI; 830 if (Edge->AllocTypes == (uint8_t)AllocationType::None) { 831 assert(Edge->ContextIds.empty()); 832 Edge->Callee->eraseCallerEdge(Edge.get()); 833 EI = Node->CalleeEdges.erase(EI); 834 } else 835 ++EI; 836 } 837 } 838 839 template <typename DerivedCCG, typename FuncTy, typename CallTy> 840 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * 841 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 842 findEdgeFromCallee(const ContextNode *Callee) { 843 for (const auto &Edge : CalleeEdges) 844 if (Edge->Callee == Callee) 845 return Edge.get(); 846 return nullptr; 847 } 848 849 template <typename DerivedCCG, typename FuncTy, typename CallTy> 850 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * 851 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 852 findEdgeFromCaller(const ContextNode 
*Caller) { 853 for (const auto &Edge : CallerEdges) 854 if (Edge->Caller == Caller) 855 return Edge.get(); 856 return nullptr; 857 } 858 859 template <typename DerivedCCG, typename FuncTy, typename CallTy> 860 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 861 eraseCalleeEdge(const ContextEdge *Edge) { 862 auto EI = llvm::find_if( 863 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { 864 return CalleeEdge.get() == Edge; 865 }); 866 assert(EI != CalleeEdges.end()); 867 CalleeEdges.erase(EI); 868 } 869 870 template <typename DerivedCCG, typename FuncTy, typename CallTy> 871 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 872 eraseCallerEdge(const ContextEdge *Edge) { 873 auto EI = llvm::find_if( 874 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { 875 return CallerEdge.get() == Edge; 876 }); 877 assert(EI != CallerEdges.end()); 878 CallerEdges.erase(EI); 879 } 880 881 template <typename DerivedCCG, typename FuncTy, typename CallTy> 882 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( 883 DenseSet<uint32_t> &ContextIds) { 884 uint8_t BothTypes = 885 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 886 uint8_t AllocType = (uint8_t)AllocationType::None; 887 for (auto Id : ContextIds) { 888 AllocType |= (uint8_t)ContextIdToAllocationType[Id]; 889 // Bail early if alloc type reached both, no further refinement. 890 if (AllocType == BothTypes) 891 return AllocType; 892 } 893 return AllocType; 894 } 895 896 template <typename DerivedCCG, typename FuncTy, typename CallTy> 897 uint8_t 898 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( 899 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { 900 uint8_t BothTypes = 901 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 902 uint8_t AllocType = (uint8_t)AllocationType::None; 903 for (auto Id : Node1Ids) { 904 if (!Node2Ids.count(Id)) 905 continue; 906 AllocType |= (uint8_t)ContextIdToAllocationType[Id]; 907 // Bail early if alloc type reached both, no further refinement. 908 if (AllocType == BothTypes) 909 return AllocType; 910 } 911 return AllocType; 912 } 913 914 template <typename DerivedCCG, typename FuncTy, typename CallTy> 915 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( 916 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { 917 if (Node1Ids.size() < Node2Ids.size()) 918 return intersectAllocTypesImpl(Node1Ids, Node2Ids); 919 else 920 return intersectAllocTypesImpl(Node2Ids, Node1Ids); 921 } 922 923 template <typename DerivedCCG, typename FuncTy, typename CallTy> 924 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 925 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( 926 CallInfo Call, const FuncTy *F) { 927 assert(!getNodeForAlloc(Call)); 928 NodeOwner.push_back( 929 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call)); 930 ContextNode *AllocNode = NodeOwner.back().get(); 931 AllocationCallToContextNodeMap[Call] = AllocNode; 932 NodeToCallingFunc[AllocNode] = F; 933 // Use LastContextId as a uniq id for MIB allocation nodes. 934 AllocNode->OrigStackOrAllocId = LastContextId; 935 // Alloc type should be updated as we add in the MIBs. We should assert 936 // afterwards that it is not still None. 
937 AllocNode->AllocTypes = (uint8_t)AllocationType::None; 938 939 return AllocNode; 940 } 941 942 template <typename DerivedCCG, typename FuncTy, typename CallTy> 943 template <class NodeT, class IteratorT> 944 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( 945 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, 946 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) { 947 // Treating the hot alloc type as NotCold before the disambiguation for "hot" 948 // is done. 949 if (AllocType == AllocationType::Hot) 950 AllocType = AllocationType::NotCold; 951 952 ContextIdToAllocationType[++LastContextId] = AllocType; 953 954 // Update alloc type and context ids for this MIB. 955 AllocNode->AllocTypes |= (uint8_t)AllocType; 956 AllocNode->ContextIds.insert(LastContextId); 957 958 // Now add or update nodes for each stack id in alloc's context. 959 // Later when processing the stack ids on non-alloc callsites we will adjust 960 // for any inlining in the context. 961 ContextNode *PrevNode = AllocNode; 962 // Look for recursion (direct recursion should have been collapsed by 963 // module summary analysis, here we should just be detecting mutual 964 // recursion). Mark these nodes so we don't try to clone. 965 SmallSet<uint64_t, 8> StackIdSet; 966 // Skip any on the allocation call (inlining). 967 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); 968 ContextIter != StackContext.end(); ++ContextIter) { 969 auto StackId = getStackId(*ContextIter); 970 ContextNode *StackNode = getNodeForStackId(StackId); 971 if (!StackNode) { 972 NodeOwner.push_back( 973 std::make_unique<ContextNode>(/*IsAllocation=*/false)); 974 StackNode = NodeOwner.back().get(); 975 StackEntryIdToContextNodeMap[StackId] = StackNode; 976 StackNode->OrigStackOrAllocId = StackId; 977 } 978 auto Ins = StackIdSet.insert(StackId); 979 if (!Ins.second) 980 StackNode->Recursive = true; 981 StackNode->ContextIds.insert(LastContextId); 982 StackNode->AllocTypes |= (uint8_t)AllocType; 983 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); 984 PrevNode = StackNode; 985 } 986 } 987 988 template <typename DerivedCCG, typename FuncTy, typename CallTy> 989 DenseSet<uint32_t> 990 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( 991 const DenseSet<uint32_t> &StackSequenceContextIds, 992 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { 993 DenseSet<uint32_t> NewContextIds; 994 for (auto OldId : StackSequenceContextIds) { 995 NewContextIds.insert(++LastContextId); 996 OldToNewContextIds[OldId].insert(LastContextId); 997 assert(ContextIdToAllocationType.count(OldId)); 998 // The new context has the same allocation type as original. 999 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; 1000 } 1001 return NewContextIds; 1002 } 1003 1004 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1005 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1006 propagateDuplicateContextIds( 1007 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { 1008 // Build a set of duplicated context ids corresponding to the input id set. 
1009 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { 1010 DenseSet<uint32_t> NewIds; 1011 for (auto Id : ContextIds) 1012 if (auto NewId = OldToNewContextIds.find(Id); 1013 NewId != OldToNewContextIds.end()) 1014 NewIds.insert(NewId->second.begin(), NewId->second.end()); 1015 return NewIds; 1016 }; 1017 1018 // Recursively update context ids sets along caller edges. 1019 auto UpdateCallers = [&](ContextNode *Node, 1020 DenseSet<const ContextEdge *> &Visited, 1021 auto &&UpdateCallers) -> void { 1022 for (const auto &Edge : Node->CallerEdges) { 1023 auto Inserted = Visited.insert(Edge.get()); 1024 if (!Inserted.second) 1025 continue; 1026 ContextNode *NextNode = Edge->Caller; 1027 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); 1028 // Only need to recursively iterate to NextNode via this caller edge if 1029 // it resulted in any added ids to NextNode. 1030 if (!NewIdsToAdd.empty()) { 1031 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 1032 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 1033 UpdateCallers(NextNode, Visited, UpdateCallers); 1034 } 1035 } 1036 }; 1037 1038 DenseSet<const ContextEdge *> Visited; 1039 for (auto &Entry : AllocationCallToContextNodeMap) { 1040 auto *Node = Entry.second; 1041 // Update ids on the allocation nodes before calling the recursive 1042 // update along caller edges, since this simplifies the logic during 1043 // that traversal. 1044 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds); 1045 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 1046 UpdateCallers(Node, Visited, UpdateCallers); 1047 } 1048 } 1049 1050 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1051 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( 1052 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { 1053 // Make a copy of the context ids, since this will be adjusted below as they 1054 // are moved. 1055 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds; 1056 auto &OrigEdges = 1057 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; 1058 // Increment iterator in loop so that we can remove edges as needed. 1059 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { 1060 auto Edge = *EI; 1061 // Remove any matching context ids from Edge, return set that were found and 1062 // removed, these are the new edge's context ids. Also update the remaining 1063 // (not found ids). 1064 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds; 1065 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, 1066 NotFoundContextIds); 1067 RemainingContextIds.swap(NotFoundContextIds); 1068 // If no matching context ids for this edge, skip it. 1069 if (NewEdgeContextIds.empty()) { 1070 ++EI; 1071 continue; 1072 } 1073 if (TowardsCallee) { 1074 auto NewEdge = std::make_shared<ContextEdge>( 1075 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds), 1076 NewEdgeContextIds); 1077 NewNode->CalleeEdges.push_back(NewEdge); 1078 NewEdge->Callee->CallerEdges.push_back(NewEdge); 1079 } else { 1080 auto NewEdge = std::make_shared<ContextEdge>( 1081 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds), 1082 NewEdgeContextIds); 1083 NewNode->CallerEdges.push_back(NewEdge); 1084 NewEdge->Caller->CalleeEdges.push_back(NewEdge); 1085 } 1086 // Remove old edge if context ids empty. 
1087 if (Edge->getContextIds().empty()) { 1088 if (TowardsCallee) { 1089 Edge->Callee->eraseCallerEdge(Edge.get()); 1090 EI = OrigNode->CalleeEdges.erase(EI); 1091 } else { 1092 Edge->Caller->eraseCalleeEdge(Edge.get()); 1093 EI = OrigNode->CallerEdges.erase(EI); 1094 } 1095 continue; 1096 } 1097 ++EI; 1098 } 1099 } 1100 1101 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1102 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1103 assignStackNodesPostOrder(ContextNode *Node, 1104 DenseSet<const ContextNode *> &Visited, 1105 DenseMap<uint64_t, std::vector<CallContextInfo>> 1106 &StackIdToMatchingCalls) { 1107 auto Inserted = Visited.insert(Node); 1108 if (!Inserted.second) 1109 return; 1110 // Post order traversal. Iterate over a copy since we may add nodes and 1111 // therefore new callers during the recursive call, invalidating any 1112 // iterator over the original edge vector. We don't need to process these 1113 // new nodes as they were already processed on creation. 1114 auto CallerEdges = Node->CallerEdges; 1115 for (auto &Edge : CallerEdges) { 1116 // Skip any that have been removed during the recursion. 1117 if (!Edge) 1118 continue; 1119 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); 1120 } 1121 1122 // If this node's stack id is in the map, update the graph to contain new 1123 // nodes representing any inlining at interior callsites. Note we move the 1124 // associated context ids over to the new nodes. 1125 1126 // Ignore this node if it is for an allocation or we didn't record any 1127 // stack id lists ending at it. 1128 if (Node->IsAllocation || 1129 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) 1130 return; 1131 1132 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; 1133 // Handle the simple case first. A single call with a single stack id. 1134 // In this case there is no need to create any new context nodes, simply 1135 // assign the context node for stack id to this Call. 1136 if (Calls.size() == 1) { 1137 auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; 1138 if (Ids.size() == 1) { 1139 assert(SavedContextIds.empty()); 1140 // It should be this Node 1141 assert(Node == getNodeForStackId(Ids[0])); 1142 if (Node->Recursive) 1143 return; 1144 Node->setCall(Call); 1145 NonAllocationCallToContextNodeMap[Call] = Node; 1146 NodeToCallingFunc[Node] = Func; 1147 return; 1148 } 1149 } 1150 1151 // Find the node for the last stack id, which should be the same 1152 // across all calls recorded for this id, and is this node's id. 1153 uint64_t LastId = Node->OrigStackOrAllocId; 1154 ContextNode *LastNode = getNodeForStackId(LastId); 1155 // We should only have kept stack ids that had nodes. 1156 assert(LastNode); 1157 1158 for (unsigned I = 0; I < Calls.size(); I++) { 1159 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1160 // Skip any for which we didn't assign any ids, these don't get a node in 1161 // the graph. 1162 if (SavedContextIds.empty()) 1163 continue; 1164 1165 assert(LastId == Ids.back()); 1166 1167 ContextNode *FirstNode = getNodeForStackId(Ids[0]); 1168 assert(FirstNode); 1169 1170 // Recompute the context ids for this stack id sequence (the 1171 // intersection of the context ids of the corresponding nodes). 1172 // Start with the ids we saved in the map for this call, which could be 1173 // duplicated context ids. 
We have to recompute as we might have
    // overlap between the saved context ids for different last nodes, and
    // removed them already during the post order traversal.
    set_intersect(SavedContextIds, FirstNode->ContextIds);
    ContextNode *PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);
      if (!PrevNode) {
        PrevNode = CurNode;
        continue;
      }
      auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
      if (!Edge) {
        SavedContextIds.clear();
        break;
      }
      PrevNode = CurNode;
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for clone, skip this call.
      if (SavedContextIds.empty())
        break;
    }
    if (SavedContextIds.empty())
      continue;

    // Create new context node.
    NodeOwner.push_back(
        std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
    ContextNode *NewNode = NodeOwner.back().get();
    NodeToCallingFunc[NewNode] = Func;
    NonAllocationCallToContextNodeMap[Call] = NewNode;
    NewNode->ContextIds = SavedContextIds;
    NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);

    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callee edges to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);

    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for LastNode's caller edges to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);

    // Now we need to remove context ids from edges/nodes between FirstNode
    // and LastNode.
    PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes.
      assert(CurNode);

      // Remove the context ids moved to NewNode from CurNode, and the
      // edge from the prior node.
      set_subtract(CurNode->ContextIds, NewNode->ContextIds);
      if (PrevNode) {
        auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
        assert(PrevEdge);
        set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
        if (PrevEdge->getContextIds().empty()) {
          PrevNode->eraseCallerEdge(PrevEdge);
          CurNode->eraseCalleeEdge(PrevEdge);
        }
      }
      PrevNode = CurNode;
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
  // Map of stack id to all calls with that as the last (outermost caller)
  // callsite id that has a context node (some might not due to pruning
  // performed during matching of the allocation profile contexts).
  // The CallContextInfo contains the Call and a list of its stack ids with
  // ContextNodes, the function containing Call, and the set of context ids
  // the analysis will eventually identify for use in any new node created
  // for that callsite.
  DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
    for (auto &Call : CallsWithMetadata) {
      // Ignore allocations, already handled.
      if (AllocationCallToContextNodeMap.count(Call))
        continue;
      auto StackIdsWithContextNodes =
          getStackIdsWithContextNodesForCall(Call.call());
      // If there were no nodes created for MIBs on allocs (maybe this was in
      // the unambiguous part of the MIB stack that was pruned), ignore.
      if (StackIdsWithContextNodes.empty())
        continue;
      // Otherwise, record this Call along with the list of ids for the last
      // (outermost caller) stack id with a node.
      StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
          {Call.call(), StackIdsWithContextNodes, Func, {}});
    }
  }

  // First make a pass through all stack ids that correspond to a call,
  // as identified in the above loop. Compute the context ids corresponding to
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
  // there is more than one call with the same stack ids. Their (possibly newly
  // duplicated) context ids are saved in the StackIdToMatchingCalls map.
  DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
  for (auto &It : StackIdToMatchingCalls) {
    auto &Calls = It.getSecond();
    // Skip single calls with a single stack id. These don't need a new node.
    if (Calls.size() == 1) {
      auto &Ids = std::get<1>(Calls[0]);
      if (Ids.size() == 1)
        continue;
    }
    // In order to do the best and maximal matching of inlined calls to context
    // node sequences we will sort the vectors of stack ids in descending order
    // of length, and within each length, lexicographically by stack id. The
    // latter is so that we can specially handle calls that have identical stack
    // id sequences (either due to cloning or artificially because of the MIB
    // context pruning).
    std::stable_sort(Calls.begin(), Calls.end(),
                     [](const CallContextInfo &A, const CallContextInfo &B) {
                       auto &IdsA = std::get<1>(A);
                       auto &IdsB = std::get<1>(B);
                       return IdsA.size() > IdsB.size() ||
                              (IdsA.size() == IdsB.size() && IdsA < IdsB);
                     });

    // Find the node for the last stack id, which should be the same
    // across all calls recorded for this id, and is the id for this
    // entry in the StackIdToMatchingCalls map.
    uint64_t LastId = It.getFirst();
    ContextNode *LastNode = getNodeForStackId(LastId);
    // We should only have kept stack ids that had nodes.
    assert(LastNode);

    if (LastNode->Recursive)
      continue;

    // Initialize the context ids with the last node's. We will subsequently
    // refine the context ids by computing the intersection along all edges.
    DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
    assert(!LastNodeContextIds.empty());

    for (unsigned I = 0; I < Calls.size(); I++) {
      auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
      assert(SavedContextIds.empty());
      assert(LastId == Ids.back());

      // First compute the context ids for this stack id sequence (the
      // intersection of the context ids of the corresponding nodes).
      // Start with the remaining saved ids for the last node.
1328 assert(!LastNodeContextIds.empty()); 1329 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; 1330 1331 ContextNode *PrevNode = LastNode; 1332 ContextNode *CurNode = LastNode; 1333 bool Skip = false; 1334 1335 // Iterate backwards through the stack Ids, starting after the last Id 1336 // in the list, which was handled once outside for all Calls. 1337 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { 1338 auto Id = *IdIter; 1339 CurNode = getNodeForStackId(Id); 1340 // We should only have kept stack ids that had nodes. 1341 assert(CurNode); 1342 1343 if (CurNode->Recursive) { 1344 Skip = true; 1345 break; 1346 } 1347 1348 auto *Edge = CurNode->findEdgeFromCaller(PrevNode); 1349 // If there is no edge then the nodes belong to different MIB contexts, 1350 // and we should skip this inlined context sequence. For example, this 1351 // particular inlined context may include stack ids A->B, and we may 1352 // indeed have nodes for both A and B, but it is possible that they were 1353 // never profiled in sequence in a single MIB for any allocation (i.e. 1354 // we might have profiled an allocation that involves the callsite A, 1355 // but through a different one of its callee callsites, and we might 1356 // have profiled an allocation that involves callsite B, but reached 1357 // from a different caller callsite). 1358 if (!Edge) { 1359 Skip = true; 1360 break; 1361 } 1362 PrevNode = CurNode; 1363 1364 // Update the context ids, which is the intersection of the ids along 1365 // all edges in the sequence. 1366 set_intersect(StackSequenceContextIds, Edge->getContextIds()); 1367 1368 // If we now have no context ids for clone, skip this call. 1369 if (StackSequenceContextIds.empty()) { 1370 Skip = true; 1371 break; 1372 } 1373 } 1374 if (Skip) 1375 continue; 1376 1377 // If some of this call's stack ids did not have corresponding nodes (due 1378 // to pruning), don't include any context ids for contexts that extend 1379 // beyond these nodes. Otherwise we would be matching part of unrelated / 1380 // not fully matching stack contexts. To do this, subtract any context ids 1381 // found in caller nodes of the last node found above. 1382 if (Ids.back() != getLastStackId(Call)) { 1383 for (const auto &PE : CurNode->CallerEdges) { 1384 set_subtract(StackSequenceContextIds, PE->getContextIds()); 1385 if (StackSequenceContextIds.empty()) 1386 break; 1387 } 1388 // If we now have no context ids for clone, skip this call. 1389 if (StackSequenceContextIds.empty()) 1390 continue; 1391 } 1392 1393 // Check if the next set of stack ids is the same (since the Calls vector 1394 // of tuples is sorted by the stack ids we can just look at the next one). 1395 bool DuplicateContextIds = false; 1396 if (I + 1 < Calls.size()) { 1397 auto NextIds = std::get<1>(Calls[I + 1]); 1398 DuplicateContextIds = Ids == NextIds; 1399 } 1400 1401 // If we don't have duplicate context ids, then we can assign all the 1402 // context ids computed for the original node sequence to this call. 1403 // If there are duplicate calls with the same stack ids then we synthesize 1404 // new context ids that are duplicates of the originals. These are 1405 // assigned to SavedContextIds, which is a reference into the map entry 1406 // for this call, allowing us to access these ids later on. 1407 OldToNewContextIds.reserve(OldToNewContextIds.size() + 1408 StackSequenceContextIds.size()); 1409 SavedContextIds = 1410 DuplicateContextIds 1411 ? 
duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) 1412 : StackSequenceContextIds; 1413 assert(!SavedContextIds.empty()); 1414 1415 if (!DuplicateContextIds) { 1416 // Update saved last node's context ids to remove those that are 1417 // assigned to other calls, so that it is ready for the next call at 1418 // this stack id. 1419 set_subtract(LastNodeContextIds, StackSequenceContextIds); 1420 if (LastNodeContextIds.empty()) 1421 break; 1422 } 1423 } 1424 } 1425 1426 // Propagate the duplicate context ids over the graph. 1427 propagateDuplicateContextIds(OldToNewContextIds); 1428 1429 if (VerifyCCG) 1430 check(); 1431 1432 // Now perform a post-order traversal over the graph, starting with the 1433 // allocation nodes, essentially processing nodes from callers to callees. 1434 // For any that contains an id in the map, update the graph to contain new 1435 // nodes representing any inlining at interior callsites. Note we move the 1436 // associated context ids over to the new nodes. 1437 DenseSet<const ContextNode *> Visited; 1438 for (auto &Entry : AllocationCallToContextNodeMap) 1439 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); 1440 } 1441 1442 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { 1443 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1444 Call->getMetadata(LLVMContext::MD_callsite)); 1445 return CallsiteContext.back(); 1446 } 1447 1448 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { 1449 assert(isa<CallsiteInfo *>(Call.getBase())); 1450 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1451 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1452 // Need to convert index into stack id. 1453 return Index.getStackIdAtIndex(CallsiteContext.back()); 1454 } 1455 1456 static const std::string MemProfCloneSuffix = ".memprof."; 1457 1458 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { 1459 // We use CloneNo == 0 to refer to the original version, which doesn't get 1460 // renamed with a suffix. 
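// E.g. clone 1 of a (hypothetical) function "foo" is named "foo.memprof.1".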
1461 if (!CloneNo) 1462 return Base.str(); 1463 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); 1464 } 1465 1466 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, 1467 const Instruction *Call, 1468 unsigned CloneNo) const { 1469 return (Twine(Call->getFunction()->getName()) + " -> " + 1470 cast<CallBase>(Call)->getCalledFunction()->getName()) 1471 .str(); 1472 } 1473 1474 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, 1475 const IndexCall &Call, 1476 unsigned CloneNo) const { 1477 auto VI = FSToVIMap.find(Func); 1478 assert(VI != FSToVIMap.end()); 1479 if (isa<AllocInfo *>(Call.getBase())) 1480 return (VI->second.name() + " -> alloc").str(); 1481 else { 1482 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase()); 1483 return (VI->second.name() + " -> " + 1484 getMemProfFuncName(Callsite->Callee.name(), 1485 Callsite->Clones[CloneNo])) 1486 .str(); 1487 } 1488 } 1489 1490 std::vector<uint64_t> 1491 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( 1492 Instruction *Call) { 1493 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1494 Call->getMetadata(LLVMContext::MD_callsite)); 1495 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( 1496 CallsiteContext); 1497 } 1498 1499 std::vector<uint64_t> 1500 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { 1501 assert(isa<CallsiteInfo *>(Call.getBase())); 1502 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1503 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1504 return getStackIdsWithContextNodes<CallsiteInfo, 1505 SmallVector<unsigned>::const_iterator>( 1506 CallsiteContext); 1507 } 1508 1509 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1510 template <class NodeT, class IteratorT> 1511 std::vector<uint64_t> 1512 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( 1513 CallStack<NodeT, IteratorT> &CallsiteContext) { 1514 std::vector<uint64_t> StackIds; 1515 for (auto IdOrIndex : CallsiteContext) { 1516 auto StackId = getStackId(IdOrIndex); 1517 ContextNode *Node = getNodeForStackId(StackId); 1518 if (!Node) 1519 break; 1520 StackIds.push_back(StackId); 1521 } 1522 return StackIds; 1523 } 1524 1525 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( 1526 Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) 1527 : Mod(M), OREGetter(OREGetter) { 1528 for (auto &F : M) { 1529 std::vector<CallInfo> CallsWithMetadata; 1530 for (auto &BB : F) { 1531 for (auto &I : BB) { 1532 if (!isa<CallBase>(I)) 1533 continue; 1534 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { 1535 CallsWithMetadata.push_back(&I); 1536 auto *AllocNode = addAllocNode(&I, &F); 1537 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); 1538 assert(CallsiteMD); 1539 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); 1540 // Add all of the MIBs and their stack nodes. 1541 for (auto &MDOp : MemProfMD->operands()) { 1542 auto *MIBMD = cast<const MDNode>(MDOp); 1543 MDNode *StackNode = getMIBStackNode(MIBMD); 1544 assert(StackNode); 1545 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); 1546 addStackNodesForMIB<MDNode, MDNode::op_iterator>( 1547 AllocNode, StackContext, CallsiteContext, 1548 getMIBAllocType(MIBMD)); 1549 } 1550 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1551 // Memprof and callsite metadata on memory allocations no longer 1552 // needed. 
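// (That information is now captured by the allocation node and the stack
// nodes added to the graph above.)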
1553 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1554 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1555 }
1556 // For callsite metadata, add to list for this function for later use.
1557 else if (I.getMetadata(LLVMContext::MD_callsite))
1558 CallsWithMetadata.push_back(&I);
1559 }
1560 }
1561 if (!CallsWithMetadata.empty())
1562 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1563 }
1564 
1565 if (DumpCCG) {
1566 dbgs() << "CCG before updating call stack chains:\n";
1567 dbgs() << *this;
1568 }
1569 
1570 if (ExportToDot)
1571 exportToDot("prestackupdate");
1572 
1573 updateStackNodes();
1574 
1575 handleCallsitesWithMultipleTargets();
1576 
1577 // Strip off remaining callsite metadata, no longer needed.
1578 for (auto &FuncEntry : FuncToCallsWithMetadata)
1579 for (auto &Call : FuncEntry.second)
1580 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1581 }
1582 
1583 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1584 ModuleSummaryIndex &Index,
1585 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1586 isPrevailing)
1587 : Index(Index), isPrevailing(isPrevailing) {
1588 for (auto &I : Index) {
1589 auto VI = Index.getValueInfo(I);
1590 for (auto &S : VI.getSummaryList()) {
1591 // We should only add the prevailing nodes. Otherwise we may try to clone
1592 // in a weak copy that won't be linked (and may be different than the
1593 // prevailing version).
1594 // We only keep the memprof summary on the prevailing copy now when
1595 // building the combined index, as a space optimization; however, don't
1596 // rely on this optimization. The linker doesn't resolve local linkage
1597 // values, so don't check whether those are prevailing.
1598 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1599 !isPrevailing(VI.getGUID(), S.get()))
1600 continue;
1601 auto *FS = dyn_cast<FunctionSummary>(S.get());
1602 if (!FS)
1603 continue;
1604 std::vector<CallInfo> CallsWithMetadata;
1605 if (!FS->allocs().empty()) {
1606 for (auto &AN : FS->mutableAllocs()) {
1607 // This can happen because of recursion elimination handling that
1608 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1609 // We still added them to the summary because we need to be able to
1610 // correlate properly in applyImport in the backends.
1611 if (AN.MIBs.empty())
1612 continue;
1613 CallsWithMetadata.push_back({&AN});
1614 auto *AllocNode = addAllocNode({&AN}, FS);
1615 // Pass an empty CallStack to the CallsiteContext (second)
1616 // parameter, since for ThinLTO we already collapsed out the inlined
1617 // stack ids on the allocation call during ModuleSummaryAnalysis.
1618 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1619 EmptyContext;
1620 // Now add all of the MIBs and their stack nodes.
1621 for (auto &MIB : AN.MIBs) {
1622 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1623 StackContext(&MIB);
1624 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1625 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1626 }
1627 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1628 // Initialize version 0 on the summary alloc node to the current alloc
1629 // type, unless it has both types, in which case make it the default.
1630 // That way, if we aren't able to clone, the original version always
1631 // ends up with the default allocation behavior.
1632 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1633 }
1634 }
1635 // For callsite metadata, add to list for this function for later use.
1636 if (!FS->callsites().empty())
1637 for (auto &SN : FS->mutableCallsites())
1638 CallsWithMetadata.push_back({&SN});
1639 
1640 if (!CallsWithMetadata.empty())
1641 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1642 
1643 if (!FS->allocs().empty() || !FS->callsites().empty())
1644 FSToVIMap[FS] = VI;
1645 }
1646 }
1647 
1648 if (DumpCCG) {
1649 dbgs() << "CCG before updating call stack chains:\n";
1650 dbgs() << *this;
1651 }
1652 
1653 if (ExportToDot)
1654 exportToDot("prestackupdate");
1655 
1656 updateStackNodes();
1657 
1658 handleCallsitesWithMultipleTargets();
1659 }
1660 
1661 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1662 void CallsiteContextGraph<DerivedCCG, FuncTy,
1663 CallTy>::handleCallsitesWithMultipleTargets() {
1664 // Look for and work around callsites that call multiple functions.
1665 // This can happen for indirect calls, which need better handling, and in
1666 // rarer cases (e.g. macro expansion).
1667 // TODO: To fix this for indirect calls we will want to perform speculative
1668 // devirtualization using either the normal PGO info with ICP, or using the
1669 // information in the profiled MemProf contexts. We can do this prior to
1670 // this transformation for regular LTO, and for ThinLTO we can simulate that
1671 // effect in the summary and perform the actual speculative devirtualization
1672 // while cloning in the ThinLTO backend.
1673 
1674 // Keep track of the new nodes synthesized for discovered tail calls missing
1675 // from the profiled contexts.
1676 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1677 
1678 for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1679 Entry != NonAllocationCallToContextNodeMap.end();) {
1680 auto *Node = Entry->second;
1681 assert(Node->Clones.empty());
1682 // Check all node callees and see if in the same function.
1683 bool Removed = false;
1684 auto Call = Node->Call.call();
1685 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1686 auto Edge = *EI;
1687 if (!Edge->Callee->hasCall()) {
1688 ++EI;
1689 continue;
1690 }
1691 assert(NodeToCallingFunc.count(Edge->Callee));
1692 // Check if the called function matches that of the callee node.
1693 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1694 continue;
1695 RemovedEdgesWithMismatchedCallees++;
1696 // Work around by setting Node to have a null call, so it gets
1697 // skipped during cloning. Otherwise assignFunctions will assert
1698 // because its data structures are not designed to handle this case.
1699 Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1700 Node->setCall(CallInfo());
1701 Removed = true;
1702 break;
1703 }
1704 if (!Removed)
1705 Entry++;
1706 }
1707 
1708 // Add the new nodes after the above loop so that the iteration is not
1709 // invalidated.
1710 for (auto &[Call, Node] : TailCallToContextNodeMap)
1711 NonAllocationCallToContextNodeMap[Call] = Node;
1712 }
1713 
1714 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1715 // In the Module (IR) case this is already the Id.
1716 return IdOrIndex;
1717 }
1718 
1719 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1720 // In the Index case this is an index into the stack id list in the summary
1721 // index; convert it to an Id.
1722 return Index.getStackIdAtIndex(IdOrIndex); 1723 } 1724 1725 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1726 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( 1727 CallTy Call, EdgeIter &EI, 1728 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { 1729 auto Edge = *EI; 1730 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; 1731 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; 1732 // Will be populated in order of callee to caller if we find a chain of tail 1733 // calls between the profiled caller and callee. 1734 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; 1735 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc, 1736 FoundCalleeChain)) { 1737 ++EI; 1738 return false; 1739 } 1740 1741 // The usual case where the profiled callee matches that of the IR/summary. 1742 if (FoundCalleeChain.empty()) { 1743 ++EI; 1744 return true; 1745 } 1746 1747 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { 1748 auto *CurEdge = Callee->findEdgeFromCaller(Caller); 1749 // If there is already an edge between these nodes, simply update it and 1750 // return. 1751 if (CurEdge) { 1752 CurEdge->ContextIds.insert(Edge->ContextIds.begin(), 1753 Edge->ContextIds.end()); 1754 CurEdge->AllocTypes |= Edge->AllocTypes; 1755 return; 1756 } 1757 // Otherwise, create a new edge and insert it into the caller and callee 1758 // lists. 1759 auto NewEdge = std::make_shared<ContextEdge>( 1760 Callee, Caller, Edge->AllocTypes, Edge->ContextIds); 1761 Callee->CallerEdges.push_back(NewEdge); 1762 if (Caller == Edge->Caller) { 1763 // If we are inserting the new edge into the current edge's caller, insert 1764 // the new edge before the current iterator position, and then increment 1765 // back to the current edge. 1766 EI = Caller->CalleeEdges.insert(EI, NewEdge); 1767 ++EI; 1768 assert(*EI == Edge && 1769 "Iterator position not restored after insert and increment"); 1770 } else 1771 Caller->CalleeEdges.push_back(NewEdge); 1772 }; 1773 1774 // Create new nodes for each found callee and connect in between the profiled 1775 // caller and callee. 1776 auto *CurCalleeNode = Edge->Callee; 1777 for (auto &[NewCall, Func] : FoundCalleeChain) { 1778 ContextNode *NewNode = nullptr; 1779 // First check if we have already synthesized a node for this tail call. 1780 if (TailCallToContextNodeMap.count(NewCall)) { 1781 NewNode = TailCallToContextNodeMap[NewCall]; 1782 NewNode->ContextIds.insert(Edge->ContextIds.begin(), 1783 Edge->ContextIds.end()); 1784 NewNode->AllocTypes |= Edge->AllocTypes; 1785 } else { 1786 FuncToCallsWithMetadata[Func].push_back({NewCall}); 1787 // Create Node and record node info. 1788 NodeOwner.push_back( 1789 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall)); 1790 NewNode = NodeOwner.back().get(); 1791 NodeToCallingFunc[NewNode] = Func; 1792 TailCallToContextNodeMap[NewCall] = NewNode; 1793 NewNode->ContextIds = Edge->ContextIds; 1794 NewNode->AllocTypes = Edge->AllocTypes; 1795 } 1796 1797 // Hook up node to its callee node 1798 AddEdge(NewNode, CurCalleeNode); 1799 1800 CurCalleeNode = NewNode; 1801 } 1802 1803 // Hook up edge's original caller to new callee node. 
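// For example (hypothetical functions): if the profiled edge was A -> C, but in
// the IR A calls B and B tail calls C, the loop above created a node for B's
// tail callsite with an edge to C's node. The AddEdge below then connects A's
// node to that new node, and the stale direct A -> C edge is removed afterwards.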
1804 AddEdge(Edge->Caller, CurCalleeNode); 1805 1806 // Remove old edge 1807 Edge->Callee->eraseCallerEdge(Edge.get()); 1808 EI = Edge->Caller->CalleeEdges.erase(EI); 1809 1810 return true; 1811 } 1812 1813 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 1814 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, 1815 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, 1816 bool &FoundMultipleCalleeChains) { 1817 // Stop recursive search if we have already explored the maximum specified 1818 // depth. 1819 if (Depth > TailCallSearchDepth) 1820 return false; 1821 1822 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { 1823 FoundCalleeChain.push_back({Callsite, F}); 1824 }; 1825 1826 auto *CalleeFunc = dyn_cast<Function>(CurCallee); 1827 if (!CalleeFunc) { 1828 auto *Alias = dyn_cast<GlobalAlias>(CurCallee); 1829 assert(Alias); 1830 CalleeFunc = dyn_cast<Function>(Alias->getAliasee()); 1831 assert(CalleeFunc); 1832 } 1833 1834 // Look for tail calls in this function, and check if they either call the 1835 // profiled callee directly, or indirectly (via a recursive search). 1836 // Only succeed if there is a single unique tail call chain found between the 1837 // profiled caller and callee, otherwise we could perform incorrect cloning. 1838 bool FoundSingleCalleeChain = false; 1839 for (auto &BB : *CalleeFunc) { 1840 for (auto &I : BB) { 1841 auto *CB = dyn_cast<CallBase>(&I); 1842 if (!CB || !CB->isTailCall()) 1843 continue; 1844 auto *CalledValue = CB->getCalledOperand(); 1845 auto *CalledFunction = CB->getCalledFunction(); 1846 if (CalledValue && !CalledFunction) { 1847 CalledValue = CalledValue->stripPointerCasts(); 1848 // Stripping pointer casts can reveal a called function. 1849 CalledFunction = dyn_cast<Function>(CalledValue); 1850 } 1851 // Check if this is an alias to a function. If so, get the 1852 // called aliasee for the checks below. 
1853 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 1854 assert(!CalledFunction && 1855 "Expected null called function in callsite for alias"); 1856 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 1857 } 1858 if (!CalledFunction) 1859 continue; 1860 if (CalledFunction == ProfiledCallee) { 1861 if (FoundSingleCalleeChain) { 1862 FoundMultipleCalleeChains = true; 1863 return false; 1864 } 1865 FoundSingleCalleeChain = true; 1866 FoundProfiledCalleeCount++; 1867 FoundProfiledCalleeDepth += Depth; 1868 if (Depth > FoundProfiledCalleeMaxDepth) 1869 FoundProfiledCalleeMaxDepth = Depth; 1870 SaveCallsiteInfo(&I, CalleeFunc); 1871 } else if (findProfiledCalleeThroughTailCalls( 1872 ProfiledCallee, CalledFunction, Depth + 1, 1873 FoundCalleeChain, FoundMultipleCalleeChains)) { 1874 if (FoundMultipleCalleeChains) 1875 return false; 1876 if (FoundSingleCalleeChain) { 1877 FoundMultipleCalleeChains = true; 1878 return false; 1879 } 1880 FoundSingleCalleeChain = true; 1881 SaveCallsiteInfo(&I, CalleeFunc); 1882 } 1883 } 1884 } 1885 1886 return FoundSingleCalleeChain; 1887 } 1888 1889 bool ModuleCallsiteContextGraph::calleeMatchesFunc( 1890 Instruction *Call, const Function *Func, const Function *CallerFunc, 1891 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { 1892 auto *CB = dyn_cast<CallBase>(Call); 1893 if (!CB->getCalledOperand()) 1894 return false; 1895 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 1896 auto *CalleeFunc = dyn_cast<Function>(CalleeVal); 1897 if (CalleeFunc == Func) 1898 return true; 1899 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 1900 if (Alias && Alias->getAliasee() == Func) 1901 return true; 1902 1903 // Recursively search for the profiled callee through tail calls starting with 1904 // the actual Callee. The discovered tail call chain is saved in 1905 // FoundCalleeChain, and we will fixup the graph to include these callsites 1906 // after returning. 1907 // FIXME: We will currently redo the same recursive walk if we find the same 1908 // mismatched callee from another callsite. We can improve this with more 1909 // bookkeeping of the created chain of new nodes for each mismatch. 1910 unsigned Depth = 1; 1911 bool FoundMultipleCalleeChains = false; 1912 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth, 1913 FoundCalleeChain, 1914 FoundMultipleCalleeChains)) { 1915 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " 1916 << Func->getName() << " from " << CallerFunc->getName() 1917 << " that actually called " << CalleeVal->getName() 1918 << (FoundMultipleCalleeChains 1919 ? " (found multiple possible chains)" 1920 : "") 1921 << "\n"); 1922 if (FoundMultipleCalleeChains) 1923 FoundProfiledCalleeNonUniquelyCount++; 1924 return false; 1925 } 1926 1927 return true; 1928 } 1929 1930 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 1931 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, 1932 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, 1933 bool &FoundMultipleCalleeChains) { 1934 // Stop recursive search if we have already explored the maximum specified 1935 // depth. 1936 if (Depth > TailCallSearchDepth) 1937 return false; 1938 1939 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { 1940 // Make a CallsiteInfo for each discovered callee, if one hasn't already 1941 // been synthesized. 
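// Reusing the cached CallsiteInfo (rather than synthesizing a duplicate) keeps
// the CallInfo key stable if the same tail callsite is discovered again.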
1942 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) || 1943 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee)) 1944 // StackIds is empty (we don't have debug info available in the index for 1945 // these callsites) 1946 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = 1947 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>()); 1948 CallsiteInfo *NewCallsiteInfo = 1949 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); 1950 FoundCalleeChain.push_back({NewCallsiteInfo, FS}); 1951 }; 1952 1953 // Look for tail calls in this function, and check if they either call the 1954 // profiled callee directly, or indirectly (via a recursive search). 1955 // Only succeed if there is a single unique tail call chain found between the 1956 // profiled caller and callee, otherwise we could perform incorrect cloning. 1957 bool FoundSingleCalleeChain = false; 1958 for (auto &S : CurCallee.getSummaryList()) { 1959 if (!GlobalValue::isLocalLinkage(S->linkage()) && 1960 !isPrevailing(CurCallee.getGUID(), S.get())) 1961 continue; 1962 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); 1963 if (!FS) 1964 continue; 1965 auto FSVI = CurCallee; 1966 auto *AS = dyn_cast<AliasSummary>(S.get()); 1967 if (AS) 1968 FSVI = AS->getAliaseeVI(); 1969 for (auto &CallEdge : FS->calls()) { 1970 if (!CallEdge.second.hasTailCall()) 1971 continue; 1972 if (CallEdge.first == ProfiledCallee) { 1973 if (FoundSingleCalleeChain) { 1974 FoundMultipleCalleeChains = true; 1975 return false; 1976 } 1977 FoundSingleCalleeChain = true; 1978 FoundProfiledCalleeCount++; 1979 FoundProfiledCalleeDepth += Depth; 1980 if (Depth > FoundProfiledCalleeMaxDepth) 1981 FoundProfiledCalleeMaxDepth = Depth; 1982 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 1983 // Add FS to FSToVIMap in case it isn't already there. 1984 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 1985 FSToVIMap[FS] = FSVI; 1986 } else if (findProfiledCalleeThroughTailCalls( 1987 ProfiledCallee, CallEdge.first, Depth + 1, 1988 FoundCalleeChain, FoundMultipleCalleeChains)) { 1989 if (FoundMultipleCalleeChains) 1990 return false; 1991 if (FoundSingleCalleeChain) { 1992 FoundMultipleCalleeChains = true; 1993 return false; 1994 } 1995 FoundSingleCalleeChain = true; 1996 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 1997 // Add FS to FSToVIMap in case it isn't already there. 1998 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 1999 FSToVIMap[FS] = FSVI; 2000 } 2001 } 2002 } 2003 2004 return FoundSingleCalleeChain; 2005 } 2006 2007 bool IndexCallsiteContextGraph::calleeMatchesFunc( 2008 IndexCall &Call, const FunctionSummary *Func, 2009 const FunctionSummary *CallerFunc, 2010 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { 2011 ValueInfo Callee = 2012 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; 2013 // If there is no summary list then this is a call to an externally defined 2014 // symbol. 2015 AliasSummary *Alias = 2016 Callee.getSummaryList().empty() 2017 ? nullptr 2018 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); 2019 assert(FSToVIMap.count(Func)); 2020 auto FuncVI = FSToVIMap[Func]; 2021 if (Callee == FuncVI || 2022 // If callee is an alias, check the aliasee, since only function 2023 // summary base objects will contain the stack node summaries and thus 2024 // get a context node. 2025 (Alias && Alias->getAliaseeVI() == FuncVI)) 2026 return true; 2027 2028 // Recursively search for the profiled callee through tail calls starting with 2029 // the actual Callee. 
The discovered tail call chain is saved in 2030 // FoundCalleeChain, and we will fixup the graph to include these callsites 2031 // after returning. 2032 // FIXME: We will currently redo the same recursive walk if we find the same 2033 // mismatched callee from another callsite. We can improve this with more 2034 // bookkeeping of the created chain of new nodes for each mismatch. 2035 unsigned Depth = 1; 2036 bool FoundMultipleCalleeChains = false; 2037 if (!findProfiledCalleeThroughTailCalls( 2038 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) { 2039 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI 2040 << " from " << FSToVIMap[CallerFunc] 2041 << " that actually called " << Callee 2042 << (FoundMultipleCalleeChains 2043 ? " (found multiple possible chains)" 2044 : "") 2045 << "\n"); 2046 if (FoundMultipleCalleeChains) 2047 FoundProfiledCalleeNonUniquelyCount++; 2048 return false; 2049 } 2050 2051 return true; 2052 } 2053 2054 static std::string getAllocTypeString(uint8_t AllocTypes) { 2055 if (!AllocTypes) 2056 return "None"; 2057 std::string Str; 2058 if (AllocTypes & (uint8_t)AllocationType::NotCold) 2059 Str += "NotCold"; 2060 if (AllocTypes & (uint8_t)AllocationType::Cold) 2061 Str += "Cold"; 2062 return Str; 2063 } 2064 2065 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2066 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() 2067 const { 2068 print(dbgs()); 2069 dbgs() << "\n"; 2070 } 2071 2072 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2073 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( 2074 raw_ostream &OS) const { 2075 OS << "Node " << this << "\n"; 2076 OS << "\t"; 2077 printCall(OS); 2078 if (Recursive) 2079 OS << " (recursive)"; 2080 OS << "\n"; 2081 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; 2082 OS << "\tContextIds:"; 2083 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2084 std::sort(SortedIds.begin(), SortedIds.end()); 2085 for (auto Id : SortedIds) 2086 OS << " " << Id; 2087 OS << "\n"; 2088 OS << "\tCalleeEdges:\n"; 2089 for (auto &Edge : CalleeEdges) 2090 OS << "\t\t" << *Edge << "\n"; 2091 OS << "\tCallerEdges:\n"; 2092 for (auto &Edge : CallerEdges) 2093 OS << "\t\t" << *Edge << "\n"; 2094 if (!Clones.empty()) { 2095 OS << "\tClones: "; 2096 FieldSeparator FS; 2097 for (auto *Clone : Clones) 2098 OS << FS << Clone; 2099 OS << "\n"; 2100 } else if (CloneOf) { 2101 OS << "\tClone of " << CloneOf << "\n"; 2102 } 2103 } 2104 2105 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2106 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() 2107 const { 2108 print(dbgs()); 2109 dbgs() << "\n"; 2110 } 2111 2112 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2113 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( 2114 raw_ostream &OS) const { 2115 OS << "Edge from Callee " << Callee << " to Caller: " << Caller 2116 << " AllocTypes: " << getAllocTypeString(AllocTypes); 2117 OS << " ContextIds:"; 2118 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2119 std::sort(SortedIds.begin(), SortedIds.end()); 2120 for (auto Id : SortedIds) 2121 OS << " " << Id; 2122 } 2123 2124 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2125 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { 2126 print(dbgs()); 2127 } 2128 2129 template <typename DerivedCCG, typename FuncTy, 
typename CallTy> 2130 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( 2131 raw_ostream &OS) const { 2132 OS << "Callsite Context Graph:\n"; 2133 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2134 for (const auto Node : nodes<GraphType>(this)) { 2135 if (Node->isRemoved()) 2136 continue; 2137 Node->print(OS); 2138 OS << "\n"; 2139 } 2140 } 2141 2142 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2143 static void checkEdge( 2144 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { 2145 // Confirm that alloc type is not None and that we have at least one context 2146 // id. 2147 assert(Edge->AllocTypes != (uint8_t)AllocationType::None); 2148 assert(!Edge->ContextIds.empty()); 2149 } 2150 2151 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2152 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, 2153 bool CheckEdges = true) { 2154 if (Node->isRemoved()) 2155 return; 2156 // Node's context ids should be the union of both its callee and caller edge 2157 // context ids. 2158 if (Node->CallerEdges.size()) { 2159 auto EI = Node->CallerEdges.begin(); 2160 auto &FirstEdge = *EI; 2161 EI++; 2162 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); 2163 for (; EI != Node->CallerEdges.end(); EI++) { 2164 const auto &Edge = *EI; 2165 if (CheckEdges) 2166 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2167 set_union(CallerEdgeContextIds, Edge->ContextIds); 2168 } 2169 // Node can have more context ids than callers if some contexts terminate at 2170 // node and some are longer. 2171 assert(Node->ContextIds == CallerEdgeContextIds || 2172 set_is_subset(CallerEdgeContextIds, Node->ContextIds)); 2173 } 2174 if (Node->CalleeEdges.size()) { 2175 auto EI = Node->CalleeEdges.begin(); 2176 auto &FirstEdge = *EI; 2177 EI++; 2178 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); 2179 for (; EI != Node->CalleeEdges.end(); EI++) { 2180 const auto &Edge = *EI; 2181 if (CheckEdges) 2182 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2183 set_union(CalleeEdgeContextIds, Edge->ContextIds); 2184 } 2185 assert(Node->ContextIds == CalleeEdgeContextIds); 2186 } 2187 } 2188 2189 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2190 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { 2191 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2192 for (const auto Node : nodes<GraphType>(this)) { 2193 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 2194 for (auto &Edge : Node->CallerEdges) 2195 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2196 } 2197 } 2198 2199 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2200 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { 2201 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2202 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; 2203 2204 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; 2205 static NodeRef getNode(const NodePtrTy &P) { return P.get(); } 2206 2207 using nodes_iterator = 2208 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, 2209 decltype(&getNode)>; 2210 2211 static nodes_iterator nodes_begin(GraphType G) { 2212 return nodes_iterator(G->NodeOwner.begin(), &getNode); 2213 } 2214 2215 static nodes_iterator nodes_end(GraphType G) { 2216 return nodes_iterator(G->NodeOwner.end(), &getNode); 2217 } 2218 2219 static NodeRef 
getEntryNode(GraphType G) { 2220 return G->NodeOwner.begin()->get(); 2221 } 2222 2223 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; 2224 static const ContextNode<DerivedCCG, FuncTy, CallTy> * 2225 GetCallee(const EdgePtrTy &P) { 2226 return P->Callee; 2227 } 2228 2229 using ChildIteratorType = 2230 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< 2231 DerivedCCG, FuncTy, CallTy>>>::const_iterator, 2232 decltype(&GetCallee)>; 2233 2234 static ChildIteratorType child_begin(NodeRef N) { 2235 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); 2236 } 2237 2238 static ChildIteratorType child_end(NodeRef N) { 2239 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); 2240 } 2241 }; 2242 2243 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2244 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> 2245 : public DefaultDOTGraphTraits { 2246 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 2247 2248 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2249 using GTraits = GraphTraits<GraphType>; 2250 using NodeRef = typename GTraits::NodeRef; 2251 using ChildIteratorType = typename GTraits::ChildIteratorType; 2252 2253 static std::string getNodeLabel(NodeRef Node, GraphType G) { 2254 std::string LabelString = 2255 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + 2256 Twine(Node->OrigStackOrAllocId)) 2257 .str(); 2258 LabelString += "\n"; 2259 if (Node->hasCall()) { 2260 auto Func = G->NodeToCallingFunc.find(Node); 2261 assert(Func != G->NodeToCallingFunc.end()); 2262 LabelString += 2263 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); 2264 } else { 2265 LabelString += "null call"; 2266 if (Node->Recursive) 2267 LabelString += " (recursive)"; 2268 else 2269 LabelString += " (external)"; 2270 } 2271 return LabelString; 2272 } 2273 2274 static std::string getNodeAttributes(NodeRef Node, GraphType) { 2275 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + 2276 getContextIds(Node->ContextIds) + "\"") 2277 .str(); 2278 AttributeString += 2279 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); 2280 AttributeString += ",style=\"filled\""; 2281 if (Node->CloneOf) { 2282 AttributeString += ",color=\"blue\""; 2283 AttributeString += ",style=\"filled,bold,dashed\""; 2284 } else 2285 AttributeString += ",style=\"filled\""; 2286 return AttributeString; 2287 } 2288 2289 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, 2290 GraphType) { 2291 auto &Edge = *(ChildIter.getCurrent()); 2292 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + 2293 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") 2294 .str(); 2295 } 2296 2297 // Since the NodeOwners list includes nodes that are no longer connected to 2298 // the graph, skip them here. 
2299 static bool isNodeHidden(NodeRef Node, GraphType) { 2300 return Node->isRemoved(); 2301 } 2302 2303 private: 2304 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { 2305 std::string IdString = "ContextIds:"; 2306 if (ContextIds.size() < 100) { 2307 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2308 std::sort(SortedIds.begin(), SortedIds.end()); 2309 for (auto Id : SortedIds) 2310 IdString += (" " + Twine(Id)).str(); 2311 } else { 2312 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); 2313 } 2314 return IdString; 2315 } 2316 2317 static std::string getColor(uint8_t AllocTypes) { 2318 if (AllocTypes == (uint8_t)AllocationType::NotCold) 2319 // Color "brown1" actually looks like a lighter red. 2320 return "brown1"; 2321 if (AllocTypes == (uint8_t)AllocationType::Cold) 2322 return "cyan"; 2323 if (AllocTypes == 2324 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 2325 // Lighter purple. 2326 return "mediumorchid1"; 2327 return "gray"; 2328 } 2329 2330 static std::string getNodeId(NodeRef Node) { 2331 std::stringstream SStream; 2332 SStream << std::hex << "N0x" << (unsigned long long)Node; 2333 std::string Result = SStream.str(); 2334 return Result; 2335 } 2336 }; 2337 2338 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2339 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( 2340 std::string Label) const { 2341 WriteGraph(this, "", false, Label, 2342 DotFilePathPrefix + "ccg." + Label + ".dot"); 2343 } 2344 2345 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2346 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 2347 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( 2348 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) { 2349 ContextNode *Node = Edge->Callee; 2350 NodeOwner.push_back( 2351 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); 2352 ContextNode *Clone = NodeOwner.back().get(); 2353 Node->addClone(Clone); 2354 assert(NodeToCallingFunc.count(Node)); 2355 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; 2356 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true); 2357 return Clone; 2358 } 2359 2360 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2361 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 2362 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 2363 ContextNode *NewCallee, EdgeIter *CallerEdgeI, 2364 bool NewClone) { 2365 // NewCallee and Edge's current callee must be clones of the same original 2366 // node (Edge's current callee may be the original node too). 2367 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); 2368 auto &EdgeContextIds = Edge->getContextIds(); 2369 ContextNode *OldCallee = Edge->Callee; 2370 if (CallerEdgeI) 2371 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); 2372 else 2373 OldCallee->eraseCallerEdge(Edge.get()); 2374 Edge->Callee = NewCallee; 2375 NewCallee->CallerEdges.push_back(Edge); 2376 // Don't need to update Edge's context ids since we are simply reconnecting 2377 // it. 2378 set_subtract(OldCallee->ContextIds, EdgeContextIds); 2379 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end()); 2380 NewCallee->AllocTypes |= Edge->AllocTypes; 2381 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); 2382 // OldCallee alloc type should be None iff its context id set is now empty. 
2383 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2384 OldCallee->ContextIds.empty());
2385 // Now walk the old callee node's callee edges and move Edge's context ids
2386 // over to the corresponding edge into the clone (which is created here if
2387 // this is a newly created clone).
2388 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2389 // The context ids moving to the new callee are the subset of this edge's
2390 // context ids and the context ids on the caller edge being moved.
2391 DenseSet<uint32_t> EdgeContextIdsToMove =
2392 set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2393 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2394 OldCalleeEdge->AllocTypes =
2395 computeAllocType(OldCalleeEdge->getContextIds());
2396 if (!NewClone) {
2397 // Update context ids / alloc type on corresponding edge to NewCallee.
2398 // There is a chance this may not exist if we are reusing an existing
2399 // clone, specifically during function assignment, where we would have
2400 // removed none type edges after creating the clone. If we can't find
2401 // a corresponding edge there, fall through to the cloning below.
2402 if (auto *NewCalleeEdge =
2403 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2404 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2405 EdgeContextIdsToMove.end());
2406 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2407 continue;
2408 }
2409 }
2410 auto NewEdge = std::make_shared<ContextEdge>(
2411 OldCalleeEdge->Callee, NewCallee,
2412 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2413 NewCallee->CalleeEdges.push_back(NewEdge);
2414 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2415 }
2416 if (VerifyCCG) {
2417 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2418 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2419 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2420 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2421 /*CheckEdges=*/false);
2422 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2423 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2424 /*CheckEdges=*/false);
2425 }
2426 }
2427 
2428 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2429 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2430 DenseSet<const ContextNode *> Visited;
2431 for (auto &Entry : AllocationCallToContextNodeMap)
2432 identifyClones(Entry.second, Visited);
2433 }
2434 
2435 // Helper function to check whether an AllocType is cold, notcold, or both.
2436 bool checkColdOrNotCold(uint8_t AllocType) {
2437 return (AllocType == (uint8_t)AllocationType::Cold) ||
2438 (AllocType == (uint8_t)AllocationType::NotCold) ||
2439 (AllocType ==
2440 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2441 }
2442 
2443 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2444 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2445 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2446 if (VerifyNodes)
2447 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2448 assert(!Node->CloneOf);
2449 
2450 // If Node has a null call, then either it wasn't found in the module (regular
2451 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2452 // cloning (e.g. recursion, calls multiple targets, etc).
2453 // Do this here so that we don't try to recursively clone callers below, which
2454 // isn't useful at least for this node.
2455 if (!Node->hasCall())
2456 return;
2457 
2458 #ifndef NDEBUG
2459 auto Insert =
2460 #endif
2461 Visited.insert(Node);
2462 // We should not have visited this node yet.
2463 assert(Insert.second);
2464 // The recursive call to identifyClones may delete the current edge from the
2465 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2466 // in an iterator and having recursive call erase from it. Other edges may
2467 // also get removed during the recursion, which will have null Callee and
2468 // Caller pointers (and are deleted later), so we skip those below.
2469 {
2470 auto CallerEdges = Node->CallerEdges;
2471 for (auto &Edge : CallerEdges) {
2472 // Skip any that have been removed by an earlier recursive call.
2473 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2474 assert(!llvm::count(Node->CallerEdges, Edge));
2475 continue;
2476 }
2477 // Ignore any caller we previously visited via another edge.
2478 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2479 identifyClones(Edge->Caller, Visited);
2480 }
2481 }
2482 }
2483 
2484 // Check if we reached an unambiguous call or have only a single caller.
2485 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2486 return;
2487 
2488 // We need to clone.
2489 
2490 // Try to keep the original version as alloc type NotCold. This will make
2491 // cases with indirect calls or any other situation with an unknown call to
2492 // the original function get the default behavior. We do this by sorting the
2493 // CallerEdges of the Node we will clone by alloc type.
2494 //
2495 // Give the NotCold edge the lowest sort priority so those edges are at the end of
2496 // the caller edges vector, and stay on the original version (since the below
2497 // code clones greedily until it finds all remaining edges have the same type
2498 // and leaves the remaining ones on the original Node).
2499 //
2500 // We shouldn't actually have any None type edges, so the sorting priority for
2501 // that is arbitrary, and we assert in that case below.
2502 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2503 /*Cold*/ 1,
2504 /*NotColdCold*/ 2};
2505 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2506 [&](const std::shared_ptr<ContextEdge> &A,
2507 const std::shared_ptr<ContextEdge> &B) {
2508 assert(checkColdOrNotCold(A->AllocTypes) &&
2509 checkColdOrNotCold(B->AllocTypes));
2510 
2511 if (A->AllocTypes == B->AllocTypes)
2512 // Use the first context id for each edge as a
2513 // tie-breaker.
2514 return *A->ContextIds.begin() < *B->ContextIds.begin();
2515 return AllocTypeCloningPriority[A->AllocTypes] <
2516 AllocTypeCloningPriority[B->AllocTypes];
2517 });
2518 
2519 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2520 
2521 // Iterate until we find no more opportunities for disambiguating the alloc
2522 // types via cloning. In most cases this loop will terminate once the Node
2523 // has a single allocation type, in which case no more cloning is needed.
2524 // We need to be able to remove Edge from CallerEdges, so need to adjust
2525 // iterator inside the loop.
2526 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2527 auto CallerEdge = *EI;
2528 
2529 // See if cloning the prior caller edge left this node with a single alloc
2530 // type or a single caller.
In that case no more cloning of Node is needed. 2531 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) 2532 break; 2533 2534 // Compute the node callee edge alloc types corresponding to the context ids 2535 // for this caller edge. 2536 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; 2537 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size()); 2538 for (auto &CalleeEdge : Node->CalleeEdges) 2539 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( 2540 CalleeEdge->getContextIds(), CallerEdge->getContextIds())); 2541 2542 // Don't clone if doing so will not disambiguate any alloc types amongst 2543 // caller edges (including the callee edges that would be cloned). 2544 // Otherwise we will simply move all edges to the clone. 2545 // 2546 // First check if by cloning we will disambiguate the caller allocation 2547 // type from node's allocation type. Query allocTypeToUse so that we don't 2548 // bother cloning to distinguish NotCold+Cold from NotCold. Note that 2549 // neither of these should be None type. 2550 // 2551 // Then check if by cloning node at least one of the callee edges will be 2552 // disambiguated by splitting out different context ids. 2553 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); 2554 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2555 if (allocTypeToUse(CallerEdge->AllocTypes) == 2556 allocTypeToUse(Node->AllocTypes) && 2557 allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2558 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { 2559 ++EI; 2560 continue; 2561 } 2562 2563 // First see if we can use an existing clone. Check each clone and its 2564 // callee edges for matching alloc types. 2565 ContextNode *Clone = nullptr; 2566 for (auto *CurClone : Node->Clones) { 2567 if (allocTypeToUse(CurClone->AllocTypes) != 2568 allocTypeToUse(CallerEdge->AllocTypes)) 2569 continue; 2570 2571 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2572 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) 2573 continue; 2574 Clone = CurClone; 2575 break; 2576 } 2577 2578 // The edge iterator is adjusted when we move the CallerEdge to the clone. 2579 if (Clone) 2580 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI); 2581 else 2582 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI); 2583 2584 assert(EI == Node->CallerEdges.end() || 2585 Node->AllocTypes != (uint8_t)AllocationType::None); 2586 // Sanity check that no alloc types on clone or its edges are None. 2587 assert(Clone->AllocTypes != (uint8_t)AllocationType::None); 2588 assert(llvm::none_of( 2589 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2590 return E->AllocTypes == (uint8_t)AllocationType::None; 2591 })); 2592 } 2593 2594 // Cloning may have resulted in some cloned callee edges with type None, 2595 // because they aren't carrying any contexts. Remove those edges. 2596 for (auto *Clone : Node->Clones) { 2597 removeNoneTypeCalleeEdges(Clone); 2598 if (VerifyNodes) 2599 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2600 } 2601 // We should still have some context ids on the original Node. 2602 assert(!Node->ContextIds.empty()); 2603 2604 // Remove any callee edges that ended up with alloc type None after creating 2605 // clones and updating callee edges. 2606 removeNoneTypeCalleeEdges(Node); 2607 2608 // Sanity check that no alloc types on node or edges are None. 
2609 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2610 assert(llvm::none_of(Node->CalleeEdges, 2611 [&](const std::shared_ptr<ContextEdge> &E) { 2612 return E->AllocTypes == (uint8_t)AllocationType::None; 2613 })); 2614 assert(llvm::none_of(Node->CallerEdges, 2615 [&](const std::shared_ptr<ContextEdge> &E) { 2616 return E->AllocTypes == (uint8_t)AllocationType::None; 2617 })); 2618 2619 if (VerifyNodes) 2620 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 2621 } 2622 2623 void ModuleCallsiteContextGraph::updateAllocationCall( 2624 CallInfo &Call, AllocationType AllocType) { 2625 std::string AllocTypeString = getAllocTypeAttributeString(AllocType); 2626 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), 2627 "memprof", AllocTypeString); 2628 cast<CallBase>(Call.call())->addFnAttr(A); 2629 OREGetter(Call.call()->getFunction()) 2630 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) 2631 << ore::NV("AllocationCall", Call.call()) << " in clone " 2632 << ore::NV("Caller", Call.call()->getFunction()) 2633 << " marked with memprof allocation attribute " 2634 << ore::NV("Attribute", AllocTypeString)); 2635 } 2636 2637 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, 2638 AllocationType AllocType) { 2639 auto *AI = Call.call().dyn_cast<AllocInfo *>(); 2640 assert(AI); 2641 assert(AI->Versions.size() > Call.cloneNo()); 2642 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; 2643 } 2644 2645 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2646 FuncInfo CalleeFunc) { 2647 if (CalleeFunc.cloneNo() > 0) 2648 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); 2649 OREGetter(CallerCall.call()->getFunction()) 2650 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) 2651 << ore::NV("Call", CallerCall.call()) << " in clone " 2652 << ore::NV("Caller", CallerCall.call()->getFunction()) 2653 << " assigned to call function clone " 2654 << ore::NV("Callee", CalleeFunc.func())); 2655 } 2656 2657 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2658 FuncInfo CalleeFunc) { 2659 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); 2660 assert(CI && 2661 "Caller cannot be an allocation which should not have profiled calls"); 2662 assert(CI->Clones.size() > CallerCall.cloneNo()); 2663 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); 2664 } 2665 2666 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 2667 Instruction *>::FuncInfo 2668 ModuleCallsiteContextGraph::cloneFunctionForCallsite( 2669 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2670 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2671 // Use existing LLVM facilities for cloning and obtaining Call in clone 2672 ValueToValueMapTy VMap; 2673 auto *NewFunc = CloneFunction(Func.func(), VMap); 2674 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); 2675 assert(!Func.func()->getParent()->getFunction(Name)); 2676 NewFunc->setName(Name); 2677 for (auto &Inst : CallsWithMetadataInFunc) { 2678 // This map always has the initial version in it. 
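// Each entry is the original (clone 0) call; map it to its counterpart in the
// new function clone, located via the VMap populated by CloneFunction above.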
2679 assert(Inst.cloneNo() == 0); 2680 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; 2681 } 2682 OREGetter(Func.func()) 2683 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) 2684 << "created clone " << ore::NV("NewFunction", NewFunc)); 2685 return {NewFunc, CloneNo}; 2686 } 2687 2688 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 2689 IndexCall>::FuncInfo 2690 IndexCallsiteContextGraph::cloneFunctionForCallsite( 2691 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2692 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2693 // Check how many clones we have of Call (and therefore function). 2694 // The next clone number is the current size of versions array. 2695 // Confirm this matches the CloneNo provided by the caller, which is based on 2696 // the number of function clones we have. 2697 assert(CloneNo == 2698 (Call.call().is<AllocInfo *>() 2699 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() 2700 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); 2701 // Walk all the instructions in this function. Create a new version for 2702 // each (by adding an entry to the Versions/Clones summary array), and copy 2703 // over the version being called for the function clone being cloned here. 2704 // Additionally, add an entry to the CallMap for the new function clone, 2705 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) 2706 // to the new call clone. 2707 for (auto &Inst : CallsWithMetadataInFunc) { 2708 // This map always has the initial version in it. 2709 assert(Inst.cloneNo() == 0); 2710 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { 2711 assert(AI->Versions.size() == CloneNo); 2712 // We assign the allocation type later (in updateAllocationCall), just add 2713 // an entry for it here. 2714 AI->Versions.push_back(0); 2715 } else { 2716 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); 2717 assert(CI && CI->Clones.size() == CloneNo); 2718 // We assign the clone number later (in updateCall), just add an entry for 2719 // it here. 2720 CI->Clones.push_back(0); 2721 } 2722 CallMap[Inst] = {Inst.call(), CloneNo}; 2723 } 2724 return {Func.func(), CloneNo}; 2725 } 2726 2727 // This method assigns cloned callsites to functions, cloning the functions as 2728 // needed. 
The assignment is greedy and proceeds roughly as follows: 2729 // 2730 // For each function Func: 2731 // For each call with graph Node having clones: 2732 // Initialize ClonesWorklist to Node and its clones 2733 // Initialize NodeCloneCount to 0 2734 // While ClonesWorklist is not empty: 2735 // Clone = pop front ClonesWorklist 2736 // NodeCloneCount++ 2737 // If Func has been cloned less than NodeCloneCount times: 2738 // If NodeCloneCount is 1: 2739 // Assign Clone to original Func 2740 // Continue 2741 // Create a new function clone 2742 // If other callers not assigned to call a function clone yet: 2743 // Assign them to call new function clone 2744 // Continue 2745 // Assign any other caller calling the cloned version to new clone 2746 // 2747 // For each caller of Clone: 2748 // If caller is assigned to call a specific function clone: 2749 // If we cannot assign Clone to that function clone: 2750 // Create new callsite Clone NewClone 2751 // Add NewClone to ClonesWorklist 2752 // Continue 2753 // Assign Clone to existing caller's called function clone 2754 // Else: 2755 // If Clone not already assigned to a function clone: 2756 // Assign to first function clone without assignment 2757 // Assign caller to selected function clone 2758 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2759 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { 2760 bool Changed = false; 2761 2762 // Keep track of the assignment of nodes (callsites) to function clones they 2763 // call. 2764 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; 2765 2766 // Update caller node to call function version CalleeFunc, by recording the 2767 // assignment in CallsiteToCalleeFuncCloneMap. 2768 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, 2769 const FuncInfo &CalleeFunc) { 2770 assert(Caller->hasCall()); 2771 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; 2772 }; 2773 2774 // Walk all functions for which we saw calls with memprof metadata, and handle 2775 // cloning for each of its calls. 2776 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 2777 FuncInfo OrigFunc(Func); 2778 // Map from each clone of OrigFunc to a map of remappings of each call of 2779 // interest (from original uncloned call to the corresponding cloned call in 2780 // that function clone). 2781 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; 2782 for (auto &Call : CallsWithMetadata) { 2783 ContextNode *Node = getNodeForInst(Call); 2784 // Skip call if we do not have a node for it (all uses of its stack ids 2785 // were either on inlined chains or pruned from the MIBs), or if we did 2786 // not create any clones for it. 2787 if (!Node || Node->Clones.empty()) 2788 continue; 2789 assert(Node->hasCall() && 2790 "Not having a call should have prevented cloning"); 2791 2792 // Track the assignment of function clones to clones of the current 2793 // callsite Node being handled. 2794 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; 2795 2796 // Assign callsite version CallsiteClone to function version FuncClone, 2797 // and also assign (possibly cloned) Call to CallsiteClone. 2798 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, 2799 CallInfo &Call, 2800 ContextNode *CallsiteClone, 2801 bool IsAlloc) { 2802 // Record the clone of callsite node assigned to this function clone. 
2803 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; 2804 2805 assert(FuncClonesToCallMap.count(FuncClone)); 2806 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; 2807 CallInfo CallClone(Call); 2808 if (CallMap.count(Call)) 2809 CallClone = CallMap[Call]; 2810 CallsiteClone->setCall(CallClone); 2811 }; 2812 2813 // Keep track of the clones of callsite Node that need to be assigned to 2814 // function clones. This list may be expanded in the loop body below if we 2815 // find additional cloning is required. 2816 std::deque<ContextNode *> ClonesWorklist; 2817 // Ignore original Node if we moved all of its contexts to clones. 2818 if (!Node->ContextIds.empty()) 2819 ClonesWorklist.push_back(Node); 2820 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), 2821 Node->Clones.end()); 2822 2823 // Now walk through all of the clones of this callsite Node that we need, 2824 // and determine the assignment to a corresponding clone of the current 2825 // function (creating new function clones as needed). 2826 unsigned NodeCloneCount = 0; 2827 while (!ClonesWorklist.empty()) { 2828 ContextNode *Clone = ClonesWorklist.front(); 2829 ClonesWorklist.pop_front(); 2830 NodeCloneCount++; 2831 if (VerifyNodes) 2832 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2833 2834 // Need to create a new function clone if we have more callsite clones 2835 // than existing function clones, which would have been assigned to an 2836 // earlier clone in the list (we assign callsite clones to function 2837 // clones greedily). 2838 if (FuncClonesToCallMap.size() < NodeCloneCount) { 2839 // If this is the first callsite copy, assign to original function. 2840 if (NodeCloneCount == 1) { 2841 // Since FuncClonesToCallMap is empty in this case, no clones have 2842 // been created for this function yet, and no callers should have 2843 // been assigned a function clone for this callee node yet. 2844 assert(llvm::none_of( 2845 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2846 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2847 })); 2848 // Initialize with empty call map, assign Clone to original function 2849 // and its callers, and skip to the next clone. 2850 FuncClonesToCallMap[OrigFunc] = {}; 2851 AssignCallsiteCloneToFuncClone( 2852 OrigFunc, Call, Clone, 2853 AllocationCallToContextNodeMap.count(Call)); 2854 for (auto &CE : Clone->CallerEdges) { 2855 // Ignore any caller that does not have a recorded callsite Call. 2856 if (!CE->Caller->hasCall()) 2857 continue; 2858 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); 2859 } 2860 continue; 2861 } 2862 2863 // First locate which copy of OrigFunc to clone again. If a caller 2864 // of this callsite clone was already assigned to call a particular 2865 // function clone, we need to redirect all of those callers to the 2866 // new function clone, and update their other callees within this 2867 // function. 2868 FuncInfo PreviousAssignedFuncClone; 2869 auto EI = llvm::find_if( 2870 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2871 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2872 }); 2873 bool CallerAssignedToCloneOfFunc = false; 2874 if (EI != Clone->CallerEdges.end()) { 2875 const std::shared_ptr<ContextEdge> &Edge = *EI; 2876 PreviousAssignedFuncClone = 2877 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 2878 CallerAssignedToCloneOfFunc = true; 2879 } 2880 2881 // Clone function and save it along with the CallInfo map created 2882 // during cloning in the FuncClonesToCallMap. 
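// Hedged illustration of the case handled below (hypothetical names, not
// taken from any particular test): suppose OrigFunc is "foo" with callsites
// A and B. While handling B, caller X may already have been recorded in
// CallsiteToCalleeFuncCloneMap as calling an existing copy of foo
// (PreviousAssignedFuncClone). If the callsite clone of A being processed
// now needs more function copies than FuncClonesToCallMap currently holds,
// a fresh clone of foo is created below, and any caller previously assigned
// to PreviousAssignedFuncClone is redirected to the new copy, with its other
// callees in foo cloned to match.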
2883 std::map<CallInfo, CallInfo> NewCallMap; 2884 unsigned CloneNo = FuncClonesToCallMap.size(); 2885 assert(CloneNo > 0 && "Clone 0 is the original function, which " 2886 "should already exist in the map"); 2887 FuncInfo NewFuncClone = cloneFunctionForCallsite( 2888 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); 2889 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); 2890 FunctionClonesAnalysis++; 2891 Changed = true; 2892 2893 // If no caller callsites were already assigned to a clone of this 2894 // function, we can simply assign this clone to the new func clone 2895 // and update all callers to it, then skip to the next clone. 2896 if (!CallerAssignedToCloneOfFunc) { 2897 AssignCallsiteCloneToFuncClone( 2898 NewFuncClone, Call, Clone, 2899 AllocationCallToContextNodeMap.count(Call)); 2900 for (auto &CE : Clone->CallerEdges) { 2901 // Ignore any caller that does not have a recorded callsite Call. 2902 if (!CE->Caller->hasCall()) 2903 continue; 2904 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2905 } 2906 continue; 2907 } 2908 2909 // We may need to do additional node cloning in this case. 2910 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers 2911 // that were previously assigned to call PreviousAssignedFuncClone, 2912 // to record that they now call NewFuncClone. 2913 for (auto CE : Clone->CallerEdges) { 2914 // Skip any that have been removed on an earlier iteration. 2915 if (!CE) 2916 continue; 2917 // Ignore any caller that does not have a recorded callsite Call. 2918 if (!CE->Caller->hasCall()) 2919 continue; 2920 2921 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || 2922 // We subsequently fall through to later handling that 2923 // will perform any additional cloning required for 2924 // callers that were calling other function clones. 2925 CallsiteToCalleeFuncCloneMap[CE->Caller] != 2926 PreviousAssignedFuncClone) 2927 continue; 2928 2929 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2930 2931 // If we are cloning a function that was already assigned to some 2932 // callers, then essentially we are creating new callsite clones 2933 // of the other callsites in that function that are reached by those 2934 // callers. Clone the other callees of the current callsite's caller 2935 // that were already assigned to PreviousAssignedFuncClone 2936 // accordingly. This is important since we subsequently update the 2937 // calls from the nodes in the graph and their assignments to callee 2938 // functions recorded in CallsiteToCalleeFuncCloneMap. 2939 for (auto CalleeEdge : CE->Caller->CalleeEdges) { 2940 // Skip any that have been removed on an earlier iteration when 2941 // cleaning up newly None type callee edges. 2942 if (!CalleeEdge) 2943 continue; 2944 ContextNode *Callee = CalleeEdge->Callee; 2945 // Skip the current callsite, we are looking for other 2946 // callsites Caller calls, as well as any that does not have a 2947 // recorded callsite Call. 2948 if (Callee == Clone || !Callee->hasCall()) 2949 continue; 2950 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); 2951 removeNoneTypeCalleeEdges(NewClone); 2952 // Moving the edge may have resulted in some none type 2953 // callee edges on the original Callee. 2954 removeNoneTypeCalleeEdges(Callee); 2955 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 2956 // If the Callee node was already assigned to call a specific 2957 // function version, make sure its new clone is assigned to call 2958 // that same function clone. 
2959 if (CallsiteToCalleeFuncCloneMap.count(Callee))
2960 RecordCalleeFuncOfCallsite(
2961 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2962 // Update NewClone with the new Call clone of this callsite's Call
2963 // created for the new function clone created earlier.
2964 // Recall that we have already ensured when building the graph
2965 // that each caller can only call callsites within the same
2966 // function, so we are guaranteed that Callee Call is in the
2967 // current OrigFunc.
2968 // CallMap is set up as indexed by original Call at clone 0.
2969 CallInfo OrigCall(Callee->getOrigNode()->Call);
2970 OrigCall.setCloneNo(0);
2971 std::map<CallInfo, CallInfo> &CallMap =
2972 FuncClonesToCallMap[NewFuncClone];
2973 assert(CallMap.count(OrigCall));
2974 CallInfo NewCall(CallMap[OrigCall]);
2975 assert(NewCall);
2976 NewClone->setCall(NewCall);
2977 }
2978 }
2979 // Fall through to the handling below to perform the recording of the
2980 // function for this callsite clone. This enables handling of cases
2981 // where the callers were assigned to different clones of a function.
2982 }
2983
2984 // See if we can use an existing function clone. Walk through
2985 // all caller edges to see if any have already been assigned to
2986 // a clone of this callsite's function. If we can use one, do so. If not
2987 // (because that function clone is already assigned to a different clone
2988 // of this callsite), then we need to clone again.
2989 // Basically, this checking is needed to handle the case where different
2990 // caller functions/callsites may need versions of this function
2991 // containing different mixes of callsite clones across the different
2992 // callsites within the function. If that happens, we need to create
2993 // additional function clones to handle the various combinations.
2994 //
2995 // Keep track of any new clones of this callsite created by the
2996 // following loop, as well as any existing clone that we decided to
2997 // assign this clone to.
2998 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
2999 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3000 // We need to be able to remove Edge from CallerEdges, so we need to
3001 // adjust the iterator within the loop.
3002 for (auto EI = Clone->CallerEdges.begin();
3003 EI != Clone->CallerEdges.end();) {
3004 auto Edge = *EI;
3005 // Ignore any caller that does not have a recorded callsite Call.
3006 if (!Edge->Caller->hasCall()) {
3007 EI++;
3008 continue;
3009 }
3010 // If this caller is already assigned to call a version of OrigFunc, we
3011 // need to ensure we can assign this callsite clone to that function clone.
3012 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3013 FuncInfo FuncCloneCalledByCaller =
3014 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3015 // First we need to confirm that this function clone is available
3016 // for use by this callsite node clone.
3017 //
3018 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3019 // its callsite clones, one of those callsite clones X could have
3020 // been assigned to the same function clone called by Edge's caller
3021 // - if Edge's caller calls another callsite within Node's original
3022 // function, and that callsite has another caller reaching clone X.
3023 // We need to clone Node again in this case.
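// Hedged sketch of the two conflicts the condition below checks for
// (hypothetical scenario): (1) FuncCloneCalledByCaller already maps to a
// *different* clone of this callsite in FuncCloneToCurNodeCloneMap, e.g.
// because a caller of a sibling callsite already pulled another callsite
// clone into that same function copy; or (2) this callsite clone was itself
// already assigned to some other function clone while walking an earlier
// caller edge. In either case the current caller edge is moved onto a new
// (or previously created) callsite clone so that it can be assigned to a
// compatible function clone on a later worklist iteration.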
3024 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && 3025 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != 3026 Clone) || 3027 // Detect when we have multiple callers of this callsite that 3028 // have already been assigned to specific, and different, clones 3029 // of OrigFunc (due to other unrelated callsites in Func they 3030 // reach via call contexts). Is this Clone of callsite Node 3031 // assigned to a different clone of OrigFunc? If so, clone Node 3032 // again. 3033 (FuncCloneAssignedToCurCallsiteClone && 3034 FuncCloneAssignedToCurCallsiteClone != 3035 FuncCloneCalledByCaller)) { 3036 // We need to use a different newly created callsite clone, in 3037 // order to assign it to another new function clone on a 3038 // subsequent iteration over the Clones array (adjusted below). 3039 // Note we specifically do not reset the 3040 // CallsiteToCalleeFuncCloneMap entry for this caller, so that 3041 // when this new clone is processed later we know which version of 3042 // the function to copy (so that other callsite clones we have 3043 // assigned to that function clone are properly cloned over). See 3044 // comments in the function cloning handling earlier. 3045 3046 // Check if we already have cloned this callsite again while 3047 // walking through caller edges, for a caller calling the same 3048 // function clone. If so, we can move this edge to that new clone 3049 // rather than creating yet another new clone. 3050 if (FuncCloneToNewCallsiteCloneMap.count( 3051 FuncCloneCalledByCaller)) { 3052 ContextNode *NewClone = 3053 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; 3054 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); 3055 // Cleanup any none type edges cloned over. 3056 removeNoneTypeCalleeEdges(NewClone); 3057 } else { 3058 // Create a new callsite clone. 3059 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); 3060 removeNoneTypeCalleeEdges(NewClone); 3061 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = 3062 NewClone; 3063 // Add to list of clones and process later. 3064 ClonesWorklist.push_back(NewClone); 3065 assert(EI == Clone->CallerEdges.end() || 3066 Clone->AllocTypes != (uint8_t)AllocationType::None); 3067 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 3068 } 3069 // Moving the caller edge may have resulted in some none type 3070 // callee edges. 3071 removeNoneTypeCalleeEdges(Clone); 3072 // We will handle the newly created callsite clone in a subsequent 3073 // iteration over this Node's Clones. Continue here since we 3074 // already adjusted iterator EI while moving the edge. 3075 continue; 3076 } 3077 3078 // Otherwise, we can use the function clone already assigned to this 3079 // caller. 3080 if (!FuncCloneAssignedToCurCallsiteClone) { 3081 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; 3082 // Assign Clone to FuncCloneCalledByCaller 3083 AssignCallsiteCloneToFuncClone( 3084 FuncCloneCalledByCaller, Call, Clone, 3085 AllocationCallToContextNodeMap.count(Call)); 3086 } else 3087 // Don't need to do anything - callsite is already calling this 3088 // function clone. 3089 assert(FuncCloneAssignedToCurCallsiteClone == 3090 FuncCloneCalledByCaller); 3091 3092 } else { 3093 // We have not already assigned this caller to a version of 3094 // OrigFunc. Do the assignment now. 3095 3096 // First check if we have already assigned this callsite clone to a 3097 // clone of OrigFunc for another caller during this iteration over 3098 // its caller edges. 
3099 if (!FuncCloneAssignedToCurCallsiteClone) { 3100 // Find first function in FuncClonesToCallMap without an assigned 3101 // clone of this callsite Node. We should always have one 3102 // available at this point due to the earlier cloning when the 3103 // FuncClonesToCallMap size was smaller than the clone number. 3104 for (auto &CF : FuncClonesToCallMap) { 3105 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { 3106 FuncCloneAssignedToCurCallsiteClone = CF.first; 3107 break; 3108 } 3109 } 3110 assert(FuncCloneAssignedToCurCallsiteClone); 3111 // Assign Clone to FuncCloneAssignedToCurCallsiteClone 3112 AssignCallsiteCloneToFuncClone( 3113 FuncCloneAssignedToCurCallsiteClone, Call, Clone, 3114 AllocationCallToContextNodeMap.count(Call)); 3115 } else 3116 assert(FuncCloneToCurNodeCloneMap 3117 [FuncCloneAssignedToCurCallsiteClone] == Clone); 3118 // Update callers to record function version called. 3119 RecordCalleeFuncOfCallsite(Edge->Caller, 3120 FuncCloneAssignedToCurCallsiteClone); 3121 } 3122 3123 EI++; 3124 } 3125 } 3126 if (VerifyCCG) { 3127 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 3128 for (const auto &PE : Node->CalleeEdges) 3129 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 3130 for (const auto &CE : Node->CallerEdges) 3131 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 3132 for (auto *Clone : Node->Clones) { 3133 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 3134 for (const auto &PE : Clone->CalleeEdges) 3135 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 3136 for (const auto &CE : Clone->CallerEdges) 3137 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 3138 } 3139 } 3140 } 3141 } 3142 3143 auto UpdateCalls = [&](ContextNode *Node, 3144 DenseSet<const ContextNode *> &Visited, 3145 auto &&UpdateCalls) { 3146 auto Inserted = Visited.insert(Node); 3147 if (!Inserted.second) 3148 return; 3149 3150 for (auto *Clone : Node->Clones) 3151 UpdateCalls(Clone, Visited, UpdateCalls); 3152 3153 for (auto &Edge : Node->CallerEdges) 3154 UpdateCalls(Edge->Caller, Visited, UpdateCalls); 3155 3156 // Skip if either no call to update, or if we ended up with no context ids 3157 // (we moved all edges onto other clones). 3158 if (!Node->hasCall() || Node->ContextIds.empty()) 3159 return; 3160 3161 if (Node->IsAllocation) { 3162 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); 3163 return; 3164 } 3165 3166 if (!CallsiteToCalleeFuncCloneMap.count(Node)) 3167 return; 3168 3169 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; 3170 updateCall(Node->Call, CalleeFunc); 3171 }; 3172 3173 // Performs DFS traversal starting from allocation nodes to update calls to 3174 // reflect cloning decisions recorded earlier. For regular LTO this will 3175 // update the actual calls in the IR to call the appropriate function clone 3176 // (and add attributes to allocation calls), whereas for ThinLTO the decisions 3177 // are recorded in the summary entries. 3178 DenseSet<const ContextNode *> Visited; 3179 for (auto &Entry : AllocationCallToContextNodeMap) 3180 UpdateCalls(Entry.second, Visited, UpdateCalls); 3181 3182 return Changed; 3183 } 3184 3185 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( 3186 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, 3187 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 3188 &FuncToAliasMap) { 3189 // The first "clone" is the original copy, we should only call this if we 3190 // needed to create new clones. 
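// Rough sketch of the expected result, assuming clones are named with the
// usual getMemProfFuncName suffix (e.g. "<name>.memprof.<N>"): for a
// function foo with NumClones == 3, this creates foo.memprof.1 and
// foo.memprof.2 alongside the original foo, and returns one
// ValueToValueMapTy per new clone so callers can locate the cloned
// callsite instructions within each copy.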
3191 assert(NumClones > 1); 3192 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 3193 VMaps.reserve(NumClones - 1); 3194 FunctionsClonedThinBackend++; 3195 for (unsigned I = 1; I < NumClones; I++) { 3196 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); 3197 auto *NewF = CloneFunction(&F, *VMaps.back()); 3198 FunctionClonesThinBackend++; 3199 // Strip memprof and callsite metadata from clone as they are no longer 3200 // needed. 3201 for (auto &BB : *NewF) { 3202 for (auto &Inst : BB) { 3203 Inst.setMetadata(LLVMContext::MD_memprof, nullptr); 3204 Inst.setMetadata(LLVMContext::MD_callsite, nullptr); 3205 } 3206 } 3207 std::string Name = getMemProfFuncName(F.getName(), I); 3208 auto *PrevF = M.getFunction(Name); 3209 if (PrevF) { 3210 // We might have created this when adjusting callsite in another 3211 // function. It should be a declaration. 3212 assert(PrevF->isDeclaration()); 3213 NewF->takeName(PrevF); 3214 PrevF->replaceAllUsesWith(NewF); 3215 PrevF->eraseFromParent(); 3216 } else 3217 NewF->setName(Name); 3218 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) 3219 << "created clone " << ore::NV("NewFunction", NewF)); 3220 3221 // Now handle aliases to this function, and clone those as well. 3222 if (!FuncToAliasMap.count(&F)) 3223 continue; 3224 for (auto *A : FuncToAliasMap[&F]) { 3225 std::string Name = getMemProfFuncName(A->getName(), I); 3226 auto *PrevA = M.getNamedAlias(Name); 3227 auto *NewA = GlobalAlias::create(A->getValueType(), 3228 A->getType()->getPointerAddressSpace(), 3229 A->getLinkage(), Name, NewF); 3230 NewA->copyAttributesFrom(A); 3231 if (PrevA) { 3232 // We might have created this when adjusting callsite in another 3233 // function. It should be a declaration. 3234 assert(PrevA->isDeclaration()); 3235 NewA->takeName(PrevA); 3236 PrevA->replaceAllUsesWith(NewA); 3237 PrevA->eraseFromParent(); 3238 } 3239 } 3240 } 3241 return VMaps; 3242 } 3243 3244 // Locate the summary for F. This is complicated by the fact that it might 3245 // have been internalized or promoted. 3246 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, 3247 const ModuleSummaryIndex *ImportSummary) { 3248 // FIXME: Ideally we would retain the original GUID in some fashion on the 3249 // function (e.g. as metadata), but for now do our best to locate the 3250 // summary without that information. 3251 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); 3252 if (!TheFnVI) 3253 // See if theFn was internalized, by checking index directly with 3254 // original name (this avoids the name adjustment done by getGUID() for 3255 // internal symbols). 3256 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); 3257 if (TheFnVI) 3258 return TheFnVI; 3259 // Now query with the original name before any promotion was performed. 3260 StringRef OrigName = 3261 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); 3262 std::string OrigId = GlobalValue::getGlobalIdentifier( 3263 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); 3264 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); 3265 if (TheFnVI) 3266 return TheFnVI; 3267 // Could be a promoted local imported from another module. We need to pass 3268 // down more info here to find the original module id. For now, try with 3269 // the OrigName which might have been stored in the OidGuidMap in the 3270 // index. This would not work if there were same-named locals in multiple 3271 // modules, however. 
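// Roughly, the lookups attempted in this function are, in order:
//   1. the function's current GUID,
//   2. the GUID computed directly from its plain name (covers internalized
//      symbols, whose GUID would otherwise include the module path),
//   3. the GUID of the original pre-promotion local identifier in this
//      module, and
//   4. (just below) the original-ID mapping stored in the index, as a last
//      resort for promoted locals imported from another module.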
3272 auto OrigGUID = 3273 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); 3274 if (OrigGUID) 3275 TheFnVI = ImportSummary->getValueInfo(OrigGUID); 3276 return TheFnVI; 3277 } 3278 3279 bool MemProfContextDisambiguation::applyImport(Module &M) { 3280 assert(ImportSummary); 3281 bool Changed = false; 3282 3283 auto IsMemProfClone = [](const Function &F) { 3284 return F.getName().contains(MemProfCloneSuffix); 3285 }; 3286 3287 // We also need to clone any aliases that reference cloned functions, because 3288 // the modified callsites may invoke via the alias. Keep track of the aliases 3289 // for each function. 3290 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 3291 FuncToAliasMap; 3292 for (auto &A : M.aliases()) { 3293 auto *Aliasee = A.getAliaseeObject(); 3294 if (auto *F = dyn_cast<Function>(Aliasee)) 3295 FuncToAliasMap[F].insert(&A); 3296 } 3297 3298 for (auto &F : M) { 3299 if (F.isDeclaration() || IsMemProfClone(F)) 3300 continue; 3301 3302 OptimizationRemarkEmitter ORE(&F); 3303 3304 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 3305 bool ClonesCreated = false; 3306 unsigned NumClonesCreated = 0; 3307 auto CloneFuncIfNeeded = [&](unsigned NumClones) { 3308 // We should at least have version 0 which is the original copy. 3309 assert(NumClones > 0); 3310 // If only one copy needed use original. 3311 if (NumClones == 1) 3312 return; 3313 // If we already performed cloning of this function, confirm that the 3314 // requested number of clones matches (the thin link should ensure the 3315 // number of clones for each constituent callsite is consistent within 3316 // each function), before returning. 3317 if (ClonesCreated) { 3318 assert(NumClonesCreated == NumClones); 3319 return; 3320 } 3321 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); 3322 // The first "clone" is the original copy, which doesn't have a VMap. 3323 assert(VMaps.size() == NumClones - 1); 3324 Changed = true; 3325 ClonesCreated = true; 3326 NumClonesCreated = NumClones; 3327 }; 3328 3329 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, 3330 Function *CalledFunction) { 3331 // Perform cloning if not yet done. 3332 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); 3333 3334 // Should have skipped indirect calls via mayHaveMemprofSummary. 3335 assert(CalledFunction); 3336 assert(!IsMemProfClone(*CalledFunction)); 3337 3338 // Update the calls per the summary info. 3339 // Save orig name since it gets updated in the first iteration 3340 // below. 3341 auto CalleeOrigName = CalledFunction->getName(); 3342 for (unsigned J = 0; J < StackNode.Clones.size(); J++) { 3343 // Do nothing if this version calls the original version of its 3344 // callee. 3345 if (!StackNode.Clones[J]) 3346 continue; 3347 auto NewF = M.getOrInsertFunction( 3348 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]), 3349 CalledFunction->getFunctionType()); 3350 CallBase *CBClone; 3351 // Copy 0 is the original function. 3352 if (!J) 3353 CBClone = CB; 3354 else 3355 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3356 CBClone->setCalledFunction(NewF); 3357 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) 3358 << ore::NV("Call", CBClone) << " in clone " 3359 << ore::NV("Caller", CBClone->getFunction()) 3360 << " assigned to call function clone " 3361 << ore::NV("Callee", NewF.getCallee())); 3362 } 3363 }; 3364 3365 // Locate the summary for F. 
3366 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); 3367 // If not found, this could be an imported local (see comment in 3368 // findValueInfoForFunc). Skip for now as it will be cloned in its original 3369 // module (where it would have been promoted to global scope so should 3370 // satisfy any reference in this module). 3371 if (!TheFnVI) 3372 continue; 3373 3374 auto *GVSummary = 3375 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); 3376 if (!GVSummary) 3377 // Must have been imported, use the first summary (might be multiple if 3378 // this was a linkonce_odr). 3379 GVSummary = TheFnVI.getSummaryList().front().get(); 3380 3381 // If this was an imported alias skip it as we won't have the function 3382 // summary, and it should be cloned in the original module. 3383 if (isa<AliasSummary>(GVSummary)) 3384 continue; 3385 3386 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); 3387 3388 if (FS->allocs().empty() && FS->callsites().empty()) 3389 continue; 3390 3391 auto SI = FS->callsites().begin(); 3392 auto AI = FS->allocs().begin(); 3393 3394 // To handle callsite infos synthesized for tail calls which have missing 3395 // frames in the profiled context, map callee VI to the synthesized callsite 3396 // info. 3397 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; 3398 // Iterate the callsites for this function in reverse, since we place all 3399 // those synthesized for tail calls at the end. 3400 for (auto CallsiteIt = FS->callsites().rbegin(); 3401 CallsiteIt != FS->callsites().rend(); CallsiteIt++) { 3402 auto &Callsite = *CallsiteIt; 3403 // Stop as soon as we see a non-synthesized callsite info (see comment 3404 // above loop). All the entries added for discovered tail calls have empty 3405 // stack ids. 3406 if (!Callsite.StackIdIndices.empty()) 3407 break; 3408 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite}); 3409 } 3410 3411 // Assume for now that the instructions are in the exact same order 3412 // as when the summary was created, but confirm this is correct by 3413 // matching the stack ids. 3414 for (auto &BB : F) { 3415 for (auto &I : BB) { 3416 auto *CB = dyn_cast<CallBase>(&I); 3417 // Same handling as when creating module summary. 3418 if (!mayHaveMemprofSummary(CB)) 3419 continue; 3420 3421 auto *CalledValue = CB->getCalledOperand(); 3422 auto *CalledFunction = CB->getCalledFunction(); 3423 if (CalledValue && !CalledFunction) { 3424 CalledValue = CalledValue->stripPointerCasts(); 3425 // Stripping pointer casts can reveal a called function. 3426 CalledFunction = dyn_cast<Function>(CalledValue); 3427 } 3428 // Check if this is an alias to a function. If so, get the 3429 // called aliasee for the checks below. 3430 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 3431 assert(!CalledFunction && 3432 "Expected null called function in callsite for alias"); 3433 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 3434 } 3435 3436 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 3437 I.getMetadata(LLVMContext::MD_callsite)); 3438 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); 3439 3440 // Include allocs that were already assigned a memprof function 3441 // attribute in the statistics. 3442 if (CB->getAttributes().hasFnAttr("memprof")) { 3443 assert(!MemProfMD); 3444 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 3445 ? 
AllocTypeColdThinBackend++ 3446 : AllocTypeNotColdThinBackend++; 3447 OrigAllocsThinBackend++; 3448 AllocVersionsThinBackend++; 3449 if (!MaxAllocVersionsThinBackend) 3450 MaxAllocVersionsThinBackend = 1; 3451 // Remove any remaining callsite metadata and we can skip the rest of 3452 // the handling for this instruction, since no cloning needed. 3453 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3454 continue; 3455 } 3456 3457 if (MemProfMD) { 3458 // Consult the next alloc node. 3459 assert(AI != FS->allocs().end()); 3460 auto &AllocNode = *(AI++); 3461 3462 // Sanity check that the MIB stack ids match between the summary and 3463 // instruction metadata. 3464 auto MIBIter = AllocNode.MIBs.begin(); 3465 for (auto &MDOp : MemProfMD->operands()) { 3466 assert(MIBIter != AllocNode.MIBs.end()); 3467 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = 3468 MIBIter->StackIdIndices.begin(); 3469 auto *MIBMD = cast<const MDNode>(MDOp); 3470 MDNode *StackMDNode = getMIBStackNode(MIBMD); 3471 assert(StackMDNode); 3472 SmallVector<unsigned> StackIdsFromMetadata; 3473 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); 3474 for (auto ContextIter = 3475 StackContext.beginAfterSharedPrefix(CallsiteContext); 3476 ContextIter != StackContext.end(); ++ContextIter) { 3477 // If this is a direct recursion, simply skip the duplicate 3478 // entries, to be consistent with how the summary ids were 3479 // generated during ModuleSummaryAnalysis. 3480 if (!StackIdsFromMetadata.empty() && 3481 StackIdsFromMetadata.back() == *ContextIter) 3482 continue; 3483 assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); 3484 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3485 *ContextIter); 3486 StackIdIndexIter++; 3487 } 3488 MIBIter++; 3489 } 3490 3491 // Perform cloning if not yet done. 3492 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); 3493 3494 OrigAllocsThinBackend++; 3495 AllocVersionsThinBackend += AllocNode.Versions.size(); 3496 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) 3497 MaxAllocVersionsThinBackend = AllocNode.Versions.size(); 3498 3499 // If there is only one version that means we didn't end up 3500 // considering this function for cloning, and in that case the alloc 3501 // will still be none type or should have gotten the default NotCold. 3502 // Skip that after calling clone helper since that does some sanity 3503 // checks that confirm we haven't decided yet that we need cloning. 3504 if (AllocNode.Versions.size() == 1) { 3505 assert((AllocationType)AllocNode.Versions[0] == 3506 AllocationType::NotCold || 3507 (AllocationType)AllocNode.Versions[0] == 3508 AllocationType::None); 3509 UnclonableAllocsThinBackend++; 3510 continue; 3511 } 3512 3513 // All versions should have a singular allocation type. 3514 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { 3515 return Type == ((uint8_t)AllocationType::NotCold | 3516 (uint8_t)AllocationType::Cold); 3517 })); 3518 3519 // Update the allocation types per the summary info. 3520 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { 3521 // Ignore any that didn't get an assigned allocation type. 3522 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) 3523 continue; 3524 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; 3525 AllocTy == AllocationType::Cold ? 
AllocTypeColdThinBackend++ 3526 : AllocTypeNotColdThinBackend++; 3527 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); 3528 auto A = llvm::Attribute::get(F.getContext(), "memprof", 3529 AllocTypeString); 3530 CallBase *CBClone; 3531 // Copy 0 is the original function. 3532 if (!J) 3533 CBClone = CB; 3534 else 3535 // Since VMaps are only created for new clones, we index with 3536 // clone J-1 (J==0 is the original clone and does not have a VMaps 3537 // entry). 3538 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3539 CBClone->addFnAttr(A); 3540 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) 3541 << ore::NV("AllocationCall", CBClone) << " in clone " 3542 << ore::NV("Caller", CBClone->getFunction()) 3543 << " marked with memprof allocation attribute " 3544 << ore::NV("Attribute", AllocTypeString)); 3545 } 3546 } else if (!CallsiteContext.empty()) { 3547 // Consult the next callsite node. 3548 assert(SI != FS->callsites().end()); 3549 auto &StackNode = *(SI++); 3550 3551 #ifndef NDEBUG 3552 // Sanity check that the stack ids match between the summary and 3553 // instruction metadata. 3554 auto StackIdIndexIter = StackNode.StackIdIndices.begin(); 3555 for (auto StackId : CallsiteContext) { 3556 assert(StackIdIndexIter != StackNode.StackIdIndices.end()); 3557 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3558 StackId); 3559 StackIdIndexIter++; 3560 } 3561 #endif 3562 3563 CloneCallsite(StackNode, CB, CalledFunction); 3564 } else if (CB->isTailCall()) { 3565 // Locate the synthesized callsite info for the callee VI, if any was 3566 // created, and use that for cloning. 3567 ValueInfo CalleeVI = 3568 findValueInfoForFunc(*CalledFunction, M, ImportSummary); 3569 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) { 3570 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI); 3571 assert(Callsite != MapTailCallCalleeVIToCallsite.end()); 3572 CloneCallsite(Callsite->second, CB, CalledFunction); 3573 } 3574 } 3575 // Memprof and callsite metadata on memory allocations no longer needed. 3576 I.setMetadata(LLVMContext::MD_memprof, nullptr); 3577 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3578 } 3579 } 3580 } 3581 3582 return Changed; 3583 } 3584 3585 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3586 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { 3587 if (DumpCCG) { 3588 dbgs() << "CCG before cloning:\n"; 3589 dbgs() << *this; 3590 } 3591 if (ExportToDot) 3592 exportToDot("postbuild"); 3593 3594 if (VerifyCCG) { 3595 check(); 3596 } 3597 3598 identifyClones(); 3599 3600 if (VerifyCCG) { 3601 check(); 3602 } 3603 3604 if (DumpCCG) { 3605 dbgs() << "CCG after cloning:\n"; 3606 dbgs() << *this; 3607 } 3608 if (ExportToDot) 3609 exportToDot("cloned"); 3610 3611 bool Changed = assignFunctions(); 3612 3613 if (DumpCCG) { 3614 dbgs() << "CCG after assigning function clones:\n"; 3615 dbgs() << *this; 3616 } 3617 if (ExportToDot) 3618 exportToDot("clonefuncassign"); 3619 3620 return Changed; 3621 } 3622 3623 bool MemProfContextDisambiguation::processModule( 3624 Module &M, 3625 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { 3626 3627 // If we have an import summary, then the cloning decisions were made during 3628 // the thin link on the index. Apply them and return. 
3629 if (ImportSummary) 3630 return applyImport(M); 3631 3632 // TODO: If/when other types of memprof cloning are enabled beyond just for 3633 // hot and cold, we will need to change this to individually control the 3634 // AllocationType passed to addStackNodesForMIB during CCG construction. 3635 // Note that we specifically check this after applying imports above, so that 3636 // the option isn't needed to be passed to distributed ThinLTO backend 3637 // clang processes, which won't necessarily have visibility into the linker 3638 // dependences. Instead the information is communicated from the LTO link to 3639 // the backends via the combined summary index. 3640 if (!SupportsHotColdNew) 3641 return false; 3642 3643 ModuleCallsiteContextGraph CCG(M, OREGetter); 3644 return CCG.process(); 3645 } 3646 3647 MemProfContextDisambiguation::MemProfContextDisambiguation( 3648 const ModuleSummaryIndex *Summary) 3649 : ImportSummary(Summary) { 3650 if (ImportSummary) { 3651 // The MemProfImportSummary should only be used for testing ThinLTO 3652 // distributed backend handling via opt, in which case we don't have a 3653 // summary from the pass pipeline. 3654 assert(MemProfImportSummary.empty()); 3655 return; 3656 } 3657 if (MemProfImportSummary.empty()) 3658 return; 3659 3660 auto ReadSummaryFile = 3661 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); 3662 if (!ReadSummaryFile) { 3663 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), 3664 "Error loading file '" + MemProfImportSummary + 3665 "': "); 3666 return; 3667 } 3668 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); 3669 if (!ImportSummaryForTestingOrErr) { 3670 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), 3671 "Error parsing file '" + MemProfImportSummary + 3672 "': "); 3673 return; 3674 } 3675 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); 3676 ImportSummary = ImportSummaryForTesting.get(); 3677 } 3678 3679 PreservedAnalyses MemProfContextDisambiguation::run(Module &M, 3680 ModuleAnalysisManager &AM) { 3681 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 3682 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { 3683 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 3684 }; 3685 if (!processModule(M, OREGetter)) 3686 return PreservedAnalyses::all(); 3687 return PreservedAnalyses::none(); 3688 } 3689 3690 void MemProfContextDisambiguation::run( 3691 ModuleSummaryIndex &Index, 3692 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 3693 isPrevailing) { 3694 // TODO: If/when other types of memprof cloning are enabled beyond just for 3695 // hot and cold, we will need to change this to individually control the 3696 // AllocationType passed to addStackNodesForMIB during CCG construction. 3697 // The index was set from the option, so these should be in sync. 3698 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); 3699 if (!SupportsHotColdNew) 3700 return; 3701 3702 IndexCallsiteContextGraph CCG(Index, isPrevailing); 3703 CCG.process(); 3704 } 3705
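// Example (a hedged sketch, not taken from an existing test): the ThinLTO
// backend handling above can be exercised in isolation through opt by
// supplying a combined summary via the testing-only option declared near
// the top of this file, along the lines of:
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=<combined-summary-bitcode> input.ll -S
// The exact pass name must match its registration in the pass registry.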