1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
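//
// Purely illustrative sketch (the metadata layout is simplified and the
// numeric stack ids are invented for exposition): an allocation call carrying
// MemProf metadata might look roughly like
//
//   %call = call ptr @_Znam(i64 10), !memprof !1, !callsite !6
//   !1 = !{!2, !4}             ; two MIBs, one per profiled context
//   !2 = !{!3, !"notcold"}
//   !3 = !{i64 123, i64 456}   ; call stack ids of the notcold context
//   !4 = !{!5, !"cold"}
//   !5 = !{i64 123, i64 789}   ; call stack ids of the cold context
//   !6 = !{i64 123}            ; callsite id of the allocation call itself
//
// Once cloning has exposed the cold context, the allocation call reached only
// from that context can be annotated (e.g. with a "memprof"="cold" attribute)
// for the later transformation mentioned above.
//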
21 //===----------------------------------------------------------------------===//
22 
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <sstream>
48 #include <vector>
49 using namespace llvm;
50 using namespace llvm::memprof;
51 
52 #define DEBUG_TYPE "memprof-context-disambiguation"
53 
54 STATISTIC(FunctionClonesAnalysis,
55           "Number of function clones created during whole program analysis");
56 STATISTIC(FunctionClonesThinBackend,
57           "Number of function clones created during ThinLTO backend");
58 STATISTIC(FunctionsClonedThinBackend,
59           "Number of functions that had clones created during ThinLTO backend");
60 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
61                             "cloned) during whole program analysis");
62 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
63                          "during whole program analysis");
64 STATISTIC(AllocTypeNotColdThinBackend,
65           "Number of not cold static allocations (possibly cloned) during "
66           "ThinLTO backend");
67 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
68                                     "(possibly cloned) during ThinLTO backend");
69 STATISTIC(OrigAllocsThinBackend,
70           "Number of original (not cloned) allocations with memprof profiles "
71           "during ThinLTO backend");
72 STATISTIC(
73     AllocVersionsThinBackend,
74     "Number of allocation versions (including clones) during ThinLTO backend");
75 STATISTIC(MaxAllocVersionsThinBackend,
76           "Maximum number of allocation versions created for an original "
77           "allocation during ThinLTO backend");
78 STATISTIC(UnclonableAllocsThinBackend,
79           "Number of unclonable ambiguous allocations during ThinLTO backend");
80 STATISTIC(RemovedEdgesWithMismatchedCallees,
81           "Number of edges removed due to mismatched callees (profiled vs IR)");
82 STATISTIC(FoundProfiledCalleeCount,
83           "Number of profiled callees found via tail calls");
84 STATISTIC(FoundProfiledCalleeDepth,
85           "Aggregate depth of profiled callees found via tail calls");
86 STATISTIC(FoundProfiledCalleeMaxDepth,
87           "Maximum depth of profiled callees found via tail calls");
88 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
89           "Number of profiled callees found via multiple tail call chains");
90 
91 static cl::opt<std::string> DotFilePathPrefix(
92     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
93     cl::value_desc("filename"),
94     cl::desc("Specify the path prefix of the MemProf dot files."));
95 
96 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
97                                  cl::Hidden,
98                                  cl::desc("Export graph to dot files."));
99 
100 static cl::opt<bool>
101     DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
102             cl::desc("Dump CallingContextGraph to stdout after each stage."));
103 
104 static cl::opt<bool>
105     VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
106               cl::desc("Perform verification checks on CallingContextGraph."));
107 
108 static cl::opt<bool>
109     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
110                 cl::desc("Perform frequent verification checks on nodes."));
111 
112 static cl::opt<std::string> MemProfImportSummary(
113     "memprof-import-summary",
114     cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
115     cl::Hidden);
116 
117 static cl::opt<unsigned>
118     TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
119                         cl::Hidden,
120                         cl::desc("Max depth to recursively search for missing "
121                                  "frames through tail calls."));
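
// A note on testing (illustrative, not from the original source): the hidden
// options above can be combined when running the pass through opt, assuming
// the pass is registered under the DEBUG_TYPE name defined above, e.g.:
//   opt -passes=memprof-context-disambiguation -memprof-verify-ccg \
//       -memprof-export-to-dot -memprof-dot-file-path-prefix=/tmp/ccg. \
//       -S in.ll -o out.ll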
122 
123 namespace llvm {
124 // Indicate we are linking with an allocator that supports hot/cold operator
125 // new interfaces.
126 cl::opt<bool> SupportsHotColdNew(
127     "supports-hot-cold-new", cl::init(false), cl::Hidden,
128     cl::desc("Linking with hot/cold operator new interfaces"));
129 } // namespace llvm
130 
131 namespace {
132 /// CRTP base for graphs built from either IR or ThinLTO summary index.
133 ///
134 /// The graph represents the call contexts in all memprof metadata on allocation
135 /// calls, with nodes for the allocations themselves, as well as for the calls
136 /// in each context. The graph is initially built from the allocation memprof
137 /// metadata (or summary) MIBs. Calls with callsite metadata are then matched
138 /// onto the nodes, and the graph is updated to reflect any inlining performed
139 /// on those calls.
140 ///
141 /// Each MIB (representing an allocation's call context with allocation
142 /// behavior) is assigned a unique context id during the graph build. The edges
143 /// and nodes in the graph are decorated with the context ids they carry. This
144 /// is used to correctly update the graph when cloning is performed so that we
145 /// can uniquify the context for a single (possibly cloned) allocation.
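///
/// Purely illustrative example (ids and frames invented): suppose an
/// allocation is reached via two profiled contexts, main -> A -> alloc
/// (notcold) and main -> B -> alloc (cold). The two MIBs get context ids 1
/// and 2. The alloc node and the node for main then carry {1, 2}, while the
/// node and edges for A carry {1} and those for B carry {2}, so cloning the
/// allocation for the cold context only needs to follow the edges carrying
/// id 2.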
146 template <typename DerivedCCG, typename FuncTy, typename CallTy>
147 class CallsiteContextGraph {
148 public:
149   CallsiteContextGraph() = default;
150   CallsiteContextGraph(const CallsiteContextGraph &) = default;
151   CallsiteContextGraph(CallsiteContextGraph &&) = default;
152 
153   /// Main entry point to perform analysis and transformations on graph.
154   bool process();
155 
156   /// Perform cloning on the graph necessary to uniquely identify the allocation
157   /// behavior of an allocation based on its context.
158   void identifyClones();
159 
160   /// Assign callsite clones to functions, cloning functions as needed to
161   /// accommodate the combinations of their callsite clones reached by callers.
162   /// For regular LTO this clones functions and callsites in the IR, but for
163   /// ThinLTO the cloning decisions are noted in the summaries and later applied
164   /// in applyImport.
165   bool assignFunctions();
166 
167   void dump() const;
168   void print(raw_ostream &OS) const;
169 
170   friend raw_ostream &operator<<(raw_ostream &OS,
171                                  const CallsiteContextGraph &CCG) {
172     CCG.print(OS);
173     return OS;
174   }
175 
176   friend struct GraphTraits<
177       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
178   friend struct DOTGraphTraits<
179       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
180 
181   void exportToDot(std::string Label) const;
182 
183   /// Represents a function clone via FuncTy pointer and clone number pair.
184   struct FuncInfo final
185       : public std::pair<FuncTy *, unsigned /*Clone number*/> {
186     using Base = std::pair<FuncTy *, unsigned>;
187     FuncInfo(const Base &B) : Base(B) {}
188     FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
189     explicit operator bool() const { return this->first != nullptr; }
190     FuncTy *func() const { return this->first; }
191     unsigned cloneNo() const { return this->second; }
192   };
193 
194   /// Represents a callsite clone via CallTy and clone number pair.
195   struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
196     using Base = std::pair<CallTy, unsigned>;
197     CallInfo(const Base &B) : Base(B) {}
198     CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
199         : Base(Call, CloneNo) {}
200     explicit operator bool() const { return (bool)this->first; }
201     CallTy call() const { return this->first; }
202     unsigned cloneNo() const { return this->second; }
203     void setCloneNo(unsigned N) { this->second = N; }
204     void print(raw_ostream &OS) const {
205       if (!operator bool()) {
206         assert(!cloneNo());
207         OS << "null Call";
208         return;
209       }
210       call()->print(OS);
211       OS << "\t(clone " << cloneNo() << ")";
212     }
213     void dump() const {
214       print(dbgs());
215       dbgs() << "\n";
216     }
217     friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
218       Call.print(OS);
219       return OS;
220     }
221   };
222 
223   struct ContextEdge;
224 
225   /// Node in the Callsite Context Graph
226   struct ContextNode {
227     // Keep this for now since in the IR case, where we have an Instruction*,
228     // it is not as immediately discoverable. Used for printing richer
229     // information when dumping the graph.
230     bool IsAllocation;
231 
232     // Keeps track of when the Call was reset to null because there was
233     // recursion.
234     bool Recursive = false;
235 
236     // The corresponding allocation or interior call.
237     CallInfo Call;
238 
239     // For alloc nodes this is a unique id assigned when constructed, and for
240     // callsite stack nodes it is the original stack id when the node is
241     // constructed from the memprof MIB metadata on the alloc nodes. Note that
242     // this is only used when matching callsite metadata onto the stack nodes
243     // created when processing the allocation memprof MIBs, and for labeling
244     // nodes in the dot graph. Therefore we don't bother to assign a value for
245     // clones.
246     uint64_t OrigStackOrAllocId = 0;
247 
248     // This will be formed by ORing together the AllocationType enum values
249     // for contexts including this node.
250     uint8_t AllocTypes = 0;
251 
252     // Edges to all callees in the profiled call stacks.
253     // TODO: Should this be a map (from Callee node) for more efficient lookup?
254     std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
255 
256     // Edges to all callers in the profiled call stacks.
257     // TODO: Should this be a map (from Caller node) for more efficient lookup?
258     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
259 
260     // The set of IDs for contexts including this node.
261     DenseSet<uint32_t> ContextIds;
262 
263     // List of clones of this ContextNode, initially empty.
264     std::vector<ContextNode *> Clones;
265 
266     // If a clone, points to the original uncloned node.
267     ContextNode *CloneOf = nullptr;
268 
269     ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
270 
271     ContextNode(bool IsAllocation, CallInfo C)
272         : IsAllocation(IsAllocation), Call(C) {}
273 
274     void addClone(ContextNode *Clone) {
275       if (CloneOf) {
276         CloneOf->Clones.push_back(Clone);
277         Clone->CloneOf = CloneOf;
278       } else {
279         Clones.push_back(Clone);
280         assert(!Clone->CloneOf);
281         Clone->CloneOf = this;
282       }
283     }
284 
285     ContextNode *getOrigNode() {
286       if (!CloneOf)
287         return this;
288       return CloneOf;
289     }
290 
291     void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
292                                unsigned int ContextId);
293 
294     ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
295     ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
296     void eraseCalleeEdge(const ContextEdge *Edge);
297     void eraseCallerEdge(const ContextEdge *Edge);
298 
299     void setCall(CallInfo C) { Call = C; }
300 
301     bool hasCall() const { return (bool)Call.call(); }
302 
303     void printCall(raw_ostream &OS) const { Call.print(OS); }
304 
305     // True if this node was effectively removed from the graph, in which case
306     // its context id set, caller edges, and callee edges should all be empty.
307     bool isRemoved() const {
308       assert(ContextIds.empty() ==
309              (CalleeEdges.empty() && CallerEdges.empty()));
310       return ContextIds.empty();
311     }
312 
313     void dump() const;
314     void print(raw_ostream &OS) const;
315 
316     friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
317       Node.print(OS);
318       return OS;
319     }
320   };
321 
322   /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
323   /// callee.
324   struct ContextEdge {
325     ContextNode *Callee;
326     ContextNode *Caller;
327 
328     // This will be formed by ORing together the AllocationType enum values
329     // for contexts including this edge.
330     uint8_t AllocTypes = 0;
331 
332     // The set of IDs for contexts including this edge.
333     DenseSet<uint32_t> ContextIds;
334 
335     ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
336                 DenseSet<uint32_t> ContextIds)
337         : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
338           ContextIds(ContextIds) {}
339 
340     DenseSet<uint32_t> &getContextIds() { return ContextIds; }
341 
342     void dump() const;
343     void print(raw_ostream &OS) const;
344 
345     friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
346       Edge.print(OS);
347       return OS;
348     }
349   };
350 
351   /// Helper to remove callee edges that have allocation type None (due to not
352   /// carrying any context ids) after transformations.
353   void removeNoneTypeCalleeEdges(ContextNode *Node);
354 
355 protected:
356   /// Get a list of nodes corresponding to the stack ids in the given callsite
357   /// context.
358   template <class NodeT, class IteratorT>
359   std::vector<uint64_t>
360   getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
361 
362   /// Adds nodes for the given allocation and any stack ids on its memprof MIB
363   /// metadata (or summary).
364   ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
365 
366   /// Adds nodes for the given MIB stack ids.
367   template <class NodeT, class IteratorT>
368   void addStackNodesForMIB(ContextNode *AllocNode,
369                            CallStack<NodeT, IteratorT> &StackContext,
370                            CallStack<NodeT, IteratorT> &CallsiteContext,
371                            AllocationType AllocType);
372 
373   /// Matches all callsite metadata (or summary) to the nodes created for
374   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
375   /// inlining performed on those callsite instructions.
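  /// For example (illustrative): if a call in function B was inlined into A,
  /// its callsite metadata lists the stack ids for both the B and A frames.
  /// A single new node is synthesized for that inlined sequence, taking over
  /// the matching context ids and connecting to the callees of the innermost
  /// frame's node and to the callers of the outermost frame's node.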
376   void updateStackNodes();
377 
378   /// Update graph to conservatively handle any callsite stack nodes that target
379   /// multiple different callee target functions.
380   void handleCallsitesWithMultipleTargets();
381 
382   /// Save lists of calls with MemProf metadata in each function, for faster
383   /// iteration.
384   MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
385 
386   /// Map from callsite node to the enclosing caller function.
387   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
388 
389 private:
390   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
391 
392   using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
393                                      const FuncTy *, DenseSet<uint32_t>>;
394 
395   /// Assigns the given Node to calls at or inlined into the location with
396   /// the Node's stack id, after post order traversing and processing its
397   /// caller nodes. Uses the call information recorded in the given
398   /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
399   /// as needed. Called by updateStackNodes which sets up the given
400   /// StackIdToMatchingCalls map.
401   void assignStackNodesPostOrder(
402       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
403       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
404 
405   /// Duplicates the given set of context ids, updating the provided
406   /// map from each original id with the newly generated context ids,
407   /// and returning the new duplicated id set.
408   DenseSet<uint32_t> duplicateContextIds(
409       const DenseSet<uint32_t> &StackSequenceContextIds,
410       DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
411 
412   /// Propagates all duplicated context ids across the graph.
413   void propagateDuplicateContextIds(
414       const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
415 
416   /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
417   /// else to its callers. Also updates OrigNode's edges to remove any context
418   /// ids moved to the newly created edge.
419   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
420                       bool TowardsCallee);
421 
422   /// Get the stack id corresponding to the given Id or Index (for IR this will
423   /// return itself, for a summary index this will return the id recorded in the
424   /// index for that stack id index value).
425   uint64_t getStackId(uint64_t IdOrIndex) const {
426     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
427   }
428 
429   /// Returns true if the given call targets the callee of the given edge, or if
430   /// we were able to identify the call chain through intermediate tail calls.
431   /// In the latter case new context nodes are added to the graph for the
432   /// identified tail calls, and their synthesized nodes are added to
433   /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the
434   /// next element after the input position (either incremented or updated after
435   /// removing the old edge).
436   bool
437   calleesMatch(CallTy Call, EdgeIter &EI,
438                MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
439 
440   /// Returns true if the given call targets the given function, or if we were
441   /// able to identify the call chain through intermediate tail calls (in which
442   /// case FoundCalleeChain will be populated).
443   bool calleeMatchesFunc(
444       CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
445       std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
446     return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
447         Call, Func, CallerFunc, FoundCalleeChain);
448   }
449 
450   /// Get a list of nodes corresponding to the stack ids in the given
451   /// callsite's context.
452   std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
453     return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
454         Call);
455   }
456 
457   /// Get the last stack id in the context for callsite.
458   uint64_t getLastStackId(CallTy Call) {
459     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
460   }
461 
462   /// Update the allocation call to record type of allocated memory.
463   void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
464     AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
465     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
466   }
467 
468   /// Update non-allocation call to invoke (possibly cloned) function
469   /// CalleeFunc.
470   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
471     static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
472   }
473 
474   /// Clone the given function for the given callsite, recording mapping of all
475   /// of the function's tracked calls to their new versions in the CallMap.
476   /// Assigns new clones to clone number CloneNo.
477   FuncInfo cloneFunctionForCallsite(
478       FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
479       std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
480     return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
481         Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
482   }
483 
484   /// Gets a label to use in the dot graph for the given call clone in the given
485   /// function.
486   std::string getLabel(const FuncTy *Func, const CallTy Call,
487                        unsigned CloneNo) const {
488     return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
489   }
490 
491   /// Helpers to find the node corresponding to the given call or stackid.
492   ContextNode *getNodeForInst(const CallInfo &C);
493   ContextNode *getNodeForAlloc(const CallInfo &C);
494   ContextNode *getNodeForStackId(uint64_t StackId);
495 
496   /// Removes the node information recorded for the given call.
497   void unsetNodeForInst(const CallInfo &C);
498 
499   /// Computes the alloc type corresponding to the given context ids, by
500   /// unioning their recorded alloc types.
501   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
502 
503   /// Returns the allocation type of the intersection of the contexts of two
504   /// nodes (based on their provided context id sets), optimized for the case
505   /// when Node1Ids is smaller than Node2Ids.
506   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
507                                   const DenseSet<uint32_t> &Node2Ids);
508 
509   /// Returns the allocation type of the intersection of the contexts of two
510   /// nodes (based on their provided context id sets).
511   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
512                               const DenseSet<uint32_t> &Node2Ids);
513 
514   /// Create a clone of Edge's callee and move Edge to that new callee node,
515   /// performing the necessary context id and allocation type updates.
516   /// If callee's caller edge iterator is supplied, it is updated when removing
517   /// the edge from that list.
518   ContextNode *
519   moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
520                            EdgeIter *CallerEdgeI = nullptr);
521 
522   /// Change the callee of Edge to existing callee clone NewCallee, performing
523   /// the necessary context id and allocation type updates.
524   /// If callee's caller edge iterator is supplied, it is updated when removing
525   /// the edge from that list.
526   void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
527                                      ContextNode *NewCallee,
528                                      EdgeIter *CallerEdgeI = nullptr,
529                                      bool NewClone = false);
530 
531   /// Recursively perform cloning on the graph for the given Node and its
532   /// callers, in order to uniquely identify the allocation behavior of an
533   /// allocation given its context.
534   void identifyClones(ContextNode *Node,
535                       DenseSet<const ContextNode *> &Visited);
536 
537   /// Map from each context ID to the AllocationType assigned to that context.
538   std::map<uint32_t, AllocationType> ContextIdToAllocationType;
539 
540   /// Identifies the context node created for a stack id when adding the MIB
541   /// contexts to the graph. This is used to locate the context nodes when
542   /// trying to assign the corresponding callsites with those stack ids to these
543   /// nodes.
544   std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
545 
546   /// Maps to track the calls to their corresponding nodes in the graph.
547   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
548   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
549 
550   /// Owner of all ContextNode unique_ptrs.
551   std::vector<std::unique_ptr<ContextNode>> NodeOwner;
552 
553   /// Perform sanity checks on graph when requested.
554   void check() const;
555 
556   /// Keeps track of the last unique context id assigned.
557   unsigned int LastContextId = 0;
558 };
559 
560 template <typename DerivedCCG, typename FuncTy, typename CallTy>
561 using ContextNode =
562     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
563 template <typename DerivedCCG, typename FuncTy, typename CallTy>
564 using ContextEdge =
565     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
566 template <typename DerivedCCG, typename FuncTy, typename CallTy>
567 using FuncInfo =
568     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
569 template <typename DerivedCCG, typename FuncTy, typename CallTy>
570 using CallInfo =
571     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
572 
573 /// CRTP derived class for graphs built from IR (regular LTO).
574 class ModuleCallsiteContextGraph
575     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
576                                   Instruction *> {
577 public:
578   ModuleCallsiteContextGraph(
579       Module &M,
580       function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
581 
582 private:
583   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
584                               Instruction *>;
585 
586   uint64_t getStackId(uint64_t IdOrIndex) const;
587   bool calleeMatchesFunc(
588       Instruction *Call, const Function *Func, const Function *CallerFunc,
589       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
590   bool findProfiledCalleeThroughTailCalls(
591       const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
592       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
593       bool &FoundMultipleCalleeChains);
594   uint64_t getLastStackId(Instruction *Call);
595   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
596   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
597   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
598   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
599                        Instruction *>::FuncInfo
600   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
601                            std::map<CallInfo, CallInfo> &CallMap,
602                            std::vector<CallInfo> &CallsWithMetadataInFunc,
603                            unsigned CloneNo);
604   std::string getLabel(const Function *Func, const Instruction *Call,
605                        unsigned CloneNo) const;
606 
607   const Module &Mod;
608   function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
609 };
610 
611 /// Represents a call in the summary index graph, which can either be an
612 /// allocation or an interior callsite node in an allocation's context.
613 /// Holds a pointer to the corresponding data structure in the index.
614 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
615   IndexCall() : PointerUnion() {}
616   IndexCall(std::nullptr_t) : IndexCall() {}
617   IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
618   IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
619   IndexCall(PointerUnion PT) : PointerUnion(PT) {}
620 
621   IndexCall *operator->() { return this; }
622 
623   PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
624 
625   void print(raw_ostream &OS) const {
626     if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
627       OS << *AI;
628     } else {
629       auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
630       assert(CI);
631       OS << *CI;
632     }
633   }
634 };
635 
636 /// CRTP derived class for graphs built from summary index (ThinLTO).
637 class IndexCallsiteContextGraph
638     : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
639                                   IndexCall> {
640 public:
641   IndexCallsiteContextGraph(
642       ModuleSummaryIndex &Index,
643       function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
644           isPrevailing);
645 
646   ~IndexCallsiteContextGraph() {
647     // Now that we are done with the graph it is safe to add the new
648     // CallsiteInfo structs to the function summary vectors. The graph nodes
649     // point into locations within these vectors, so we don't want to add them
650     // any earlier.
651     for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
652       auto *FS = I.first;
653       for (auto &Callsite : I.second)
654         FS->addCallsite(*Callsite.second);
655     }
656   }
657 
658 private:
659   friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
660                               IndexCall>;
661 
662   uint64_t getStackId(uint64_t IdOrIndex) const;
663   bool calleeMatchesFunc(
664       IndexCall &Call, const FunctionSummary *Func,
665       const FunctionSummary *CallerFunc,
666       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
667   bool findProfiledCalleeThroughTailCalls(
668       ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
669       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
670       bool &FoundMultipleCalleeChains);
671   uint64_t getLastStackId(IndexCall &Call);
672   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
673   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
674   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
675   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
676                        IndexCall>::FuncInfo
677   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
678                            std::map<CallInfo, CallInfo> &CallMap,
679                            std::vector<CallInfo> &CallsWithMetadataInFunc,
680                            unsigned CloneNo);
681   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
682                        unsigned CloneNo) const;
683 
684   // Saves mapping from function summaries containing memprof records back to
685   // their VI, for use in checking and debugging.
686   std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
687 
688   const ModuleSummaryIndex &Index;
689   function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
690       isPrevailing;
691 
692   // Saves/owns the callsite info structures synthesized for missing tail call
693   // frames that we discover while building the graph.
694   // It maps from the summary of the function making the tail call, to a map
695   // of callee ValueInfo to corresponding synthesized callsite info.
696   std::unordered_map<FunctionSummary *,
697                      std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
698       FunctionCalleesToSynthesizedCallsiteInfos;
699 };
700 } // namespace
701 
702 namespace llvm {
703 template <>
704 struct DenseMapInfo<typename CallsiteContextGraph<
705     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
706     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
707 template <>
708 struct DenseMapInfo<typename CallsiteContextGraph<
709     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
710     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
711 template <>
712 struct DenseMapInfo<IndexCall>
713     : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
714 } // end namespace llvm
715 
716 namespace {
717 
718 struct FieldSeparator {
719   bool Skip = true;
720   const char *Sep;
721 
722   FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
723 };
724 
725 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
726   if (FS.Skip) {
727     FS.Skip = false;
728     return OS;
729   }
730   return OS << FS.Sep;
731 }
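
// Illustrative use (not from the original source): FieldSeparator prints its
// separator before every field except the first, so a loop like
//   FieldSeparator FS;
//   for (uint64_t Id : Ids)
//     OS << FS << Id;
// emits "1, 2, 3" with no leading or trailing separator.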
732 
733 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
734 // type we should actually use on the corresponding allocation.
735 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
736 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
737 // from NotCold.
738 AllocationType allocTypeToUse(uint8_t AllocTypes) {
739   assert(AllocTypes != (uint8_t)AllocationType::None);
740   if (AllocTypes ==
741       ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
742     return AllocationType::NotCold;
743   else
744     return (AllocationType)AllocTypes;
745 }
746 
747 // Helper to check if the alloc types for all edges recorded in the
748 // InAllocTypes vector match the alloc types for all edges in the Edges
749 // vector.
750 template <typename DerivedCCG, typename FuncTy, typename CallTy>
751 bool allocTypesMatch(
752     const std::vector<uint8_t> &InAllocTypes,
753     const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
754         &Edges) {
755   return std::equal(
756       InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
757       [](const uint8_t &l,
758          const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
759         // Can share if one of the edges is None type - don't
760         // care about the type along that edge as it doesn't
761         // exist for those context ids.
762         if (l == (uint8_t)AllocationType::None ||
763             r->AllocTypes == (uint8_t)AllocationType::None)
764           return true;
765         return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
766       });
767 }
768 
769 } // end anonymous namespace
770 
771 template <typename DerivedCCG, typename FuncTy, typename CallTy>
772 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
773 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
774     const CallInfo &C) {
775   ContextNode *Node = getNodeForAlloc(C);
776   if (Node)
777     return Node;
778 
779   return NonAllocationCallToContextNodeMap.lookup(C);
780 }
781 
782 template <typename DerivedCCG, typename FuncTy, typename CallTy>
783 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
784 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
785     const CallInfo &C) {
786   return AllocationCallToContextNodeMap.lookup(C);
787 }
788 
789 template <typename DerivedCCG, typename FuncTy, typename CallTy>
790 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
791 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
792     uint64_t StackId) {
793   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
794   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
795     return StackEntryNode->second;
796   return nullptr;
797 }
798 
799 template <typename DerivedCCG, typename FuncTy, typename CallTy>
800 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
801     const CallInfo &C) {
802   AllocationCallToContextNodeMap.erase(C) ||
803       NonAllocationCallToContextNodeMap.erase(C);
804   assert(!AllocationCallToContextNodeMap.count(C) &&
805          !NonAllocationCallToContextNodeMap.count(C));
806 }
807 
808 template <typename DerivedCCG, typename FuncTy, typename CallTy>
809 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
810     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
811                           unsigned int ContextId) {
812   for (auto &Edge : CallerEdges) {
813     if (Edge->Caller == Caller) {
814       Edge->AllocTypes |= (uint8_t)AllocType;
815       Edge->getContextIds().insert(ContextId);
816       return;
817     }
818   }
819   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
820       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
821   CallerEdges.push_back(Edge);
822   Caller->CalleeEdges.push_back(Edge);
823 }
824 
825 template <typename DerivedCCG, typename FuncTy, typename CallTy>
826 void CallsiteContextGraph<
827     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
828   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
829     auto Edge = *EI;
830     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
831       assert(Edge->ContextIds.empty());
832       Edge->Callee->eraseCallerEdge(Edge.get());
833       EI = Node->CalleeEdges.erase(EI);
834     } else
835       ++EI;
836   }
837 }
838 
839 template <typename DerivedCCG, typename FuncTy, typename CallTy>
840 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
841 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
842     findEdgeFromCallee(const ContextNode *Callee) {
843   for (const auto &Edge : CalleeEdges)
844     if (Edge->Callee == Callee)
845       return Edge.get();
846   return nullptr;
847 }
848 
849 template <typename DerivedCCG, typename FuncTy, typename CallTy>
850 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
851 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
852     findEdgeFromCaller(const ContextNode *Caller) {
853   for (const auto &Edge : CallerEdges)
854     if (Edge->Caller == Caller)
855       return Edge.get();
856   return nullptr;
857 }
858 
859 template <typename DerivedCCG, typename FuncTy, typename CallTy>
860 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
861     eraseCalleeEdge(const ContextEdge *Edge) {
862   auto EI = llvm::find_if(
863       CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
864         return CalleeEdge.get() == Edge;
865       });
866   assert(EI != CalleeEdges.end());
867   CalleeEdges.erase(EI);
868 }
869 
870 template <typename DerivedCCG, typename FuncTy, typename CallTy>
871 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
872     eraseCallerEdge(const ContextEdge *Edge) {
873   auto EI = llvm::find_if(
874       CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
875         return CallerEdge.get() == Edge;
876       });
877   assert(EI != CallerEdges.end());
878   CallerEdges.erase(EI);
879 }
880 
881 template <typename DerivedCCG, typename FuncTy, typename CallTy>
882 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
883     DenseSet<uint32_t> &ContextIds) {
884   uint8_t BothTypes =
885       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
886   uint8_t AllocType = (uint8_t)AllocationType::None;
887   for (auto Id : ContextIds) {
888     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
889     // Bail early if alloc type reached both, no further refinement.
890     if (AllocType == BothTypes)
891       return AllocType;
892   }
893   return AllocType;
894 }
895 
896 template <typename DerivedCCG, typename FuncTy, typename CallTy>
897 uint8_t
898 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
899     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
900   uint8_t BothTypes =
901       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
902   uint8_t AllocType = (uint8_t)AllocationType::None;
903   for (auto Id : Node1Ids) {
904     if (!Node2Ids.count(Id))
905       continue;
906     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
907     // Bail early if alloc type reached both, no further refinement.
908     if (AllocType == BothTypes)
909       return AllocType;
910   }
911   return AllocType;
912 }
913 
914 template <typename DerivedCCG, typename FuncTy, typename CallTy>
915 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
916     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
917   if (Node1Ids.size() < Node2Ids.size())
918     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
919   else
920     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
921 }
922 
923 template <typename DerivedCCG, typename FuncTy, typename CallTy>
924 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
925 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
926     CallInfo Call, const FuncTy *F) {
927   assert(!getNodeForAlloc(Call));
928   NodeOwner.push_back(
929       std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
930   ContextNode *AllocNode = NodeOwner.back().get();
931   AllocationCallToContextNodeMap[Call] = AllocNode;
932   NodeToCallingFunc[AllocNode] = F;
933   // Use LastContextId as a unique id for MIB allocation nodes.
934   AllocNode->OrigStackOrAllocId = LastContextId;
935   // Alloc type should be updated as we add in the MIBs. We should assert
936   // afterwards that it is not still None.
937   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
938 
939   return AllocNode;
940 }
941 
942 template <typename DerivedCCG, typename FuncTy, typename CallTy>
943 template <class NodeT, class IteratorT>
944 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
945     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
946     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
947   // Treat the hot alloc type as NotCold until the disambiguation for "hot"
948   // is done.
949   if (AllocType == AllocationType::Hot)
950     AllocType = AllocationType::NotCold;
951 
952   ContextIdToAllocationType[++LastContextId] = AllocType;
953 
954   // Update alloc type and context ids for this MIB.
955   AllocNode->AllocTypes |= (uint8_t)AllocType;
956   AllocNode->ContextIds.insert(LastContextId);
957 
958   // Now add or update nodes for each stack id in alloc's context.
959   // Later when processing the stack ids on non-alloc callsites we will adjust
960   // for any inlining in the context.
961   ContextNode *PrevNode = AllocNode;
962   // Look for recursion (direct recursion should have been collapsed by
963   // module summary analysis, here we should just be detecting mutual
964   // recursion). Mark these nodes so we don't try to clone.
965   SmallSet<uint64_t, 8> StackIdSet;
966   // Skip any stack ids on the allocation call itself (due to inlining).
967   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
968        ContextIter != StackContext.end(); ++ContextIter) {
969     auto StackId = getStackId(*ContextIter);
970     ContextNode *StackNode = getNodeForStackId(StackId);
971     if (!StackNode) {
972       NodeOwner.push_back(
973           std::make_unique<ContextNode>(/*IsAllocation=*/false));
974       StackNode = NodeOwner.back().get();
975       StackEntryIdToContextNodeMap[StackId] = StackNode;
976       StackNode->OrigStackOrAllocId = StackId;
977     }
978     auto Ins = StackIdSet.insert(StackId);
979     if (!Ins.second)
980       StackNode->Recursive = true;
981     StackNode->ContextIds.insert(LastContextId);
982     StackNode->AllocTypes |= (uint8_t)AllocType;
983     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
984     PrevNode = StackNode;
985   }
986 }
987 
988 template <typename DerivedCCG, typename FuncTy, typename CallTy>
989 DenseSet<uint32_t>
990 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
991     const DenseSet<uint32_t> &StackSequenceContextIds,
992     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
993   DenseSet<uint32_t> NewContextIds;
994   for (auto OldId : StackSequenceContextIds) {
995     NewContextIds.insert(++LastContextId);
996     OldToNewContextIds[OldId].insert(LastContextId);
997     assert(ContextIdToAllocationType.count(OldId));
998     // The new context has the same allocation type as original.
999     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1000   }
1001   return NewContextIds;
1002 }
1003 
1004 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1005 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1006     propagateDuplicateContextIds(
1007         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1008   // Build a set of duplicated context ids corresponding to the input id set.
1009   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1010     DenseSet<uint32_t> NewIds;
1011     for (auto Id : ContextIds)
1012       if (auto NewId = OldToNewContextIds.find(Id);
1013           NewId != OldToNewContextIds.end())
1014         NewIds.insert(NewId->second.begin(), NewId->second.end());
1015     return NewIds;
1016   };
1017 
1018   // Recursively update context ids sets along caller edges.
1019   auto UpdateCallers = [&](ContextNode *Node,
1020                            DenseSet<const ContextEdge *> &Visited,
1021                            auto &&UpdateCallers) -> void {
1022     for (const auto &Edge : Node->CallerEdges) {
1023       auto Inserted = Visited.insert(Edge.get());
1024       if (!Inserted.second)
1025         continue;
1026       ContextNode *NextNode = Edge->Caller;
1027       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1028       // Only need to recursively iterate to NextNode via this caller edge if
1029       // it resulted in any added ids to NextNode.
1030       if (!NewIdsToAdd.empty()) {
1031         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1032         NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1033         UpdateCallers(NextNode, Visited, UpdateCallers);
1034       }
1035     }
1036   };
1037 
1038   DenseSet<const ContextEdge *> Visited;
1039   for (auto &Entry : AllocationCallToContextNodeMap) {
1040     auto *Node = Entry.second;
1041     // Update ids on the allocation nodes before calling the recursive
1042     // update along caller edges, since this simplifies the logic during
1043     // that traversal.
1044     DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
1045     Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1046     UpdateCallers(Node, Visited, UpdateCallers);
1047   }
1048 }
1049 
1050 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1051 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1052     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
1053   // Make a copy of the context ids, since this will be adjusted below as they
1054   // are moved.
1055   DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
1056   auto &OrigEdges =
1057       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1058   // Increment iterator in loop so that we can remove edges as needed.
1059   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1060     auto Edge = *EI;
1061     // Remove any matching context ids from Edge, returning the set that were
1062     // found and removed; these become the new edge's context ids. Also update
1063     // the remaining (not found) ids.
1064     DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1065     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1066                  NotFoundContextIds);
1067     RemainingContextIds.swap(NotFoundContextIds);
1068     // If no matching context ids for this edge, skip it.
1069     if (NewEdgeContextIds.empty()) {
1070       ++EI;
1071       continue;
1072     }
1073     if (TowardsCallee) {
1074       auto NewEdge = std::make_shared<ContextEdge>(
1075           Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1076           NewEdgeContextIds);
1077       NewNode->CalleeEdges.push_back(NewEdge);
1078       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1079     } else {
1080       auto NewEdge = std::make_shared<ContextEdge>(
1081           NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1082           NewEdgeContextIds);
1083       NewNode->CallerEdges.push_back(NewEdge);
1084       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1085     }
1086     // Remove old edge if context ids empty.
1087     if (Edge->getContextIds().empty()) {
1088       if (TowardsCallee) {
1089         Edge->Callee->eraseCallerEdge(Edge.get());
1090         EI = OrigNode->CalleeEdges.erase(EI);
1091       } else {
1092         Edge->Caller->eraseCalleeEdge(Edge.get());
1093         EI = OrigNode->CallerEdges.erase(EI);
1094       }
1095       continue;
1096     }
1097     ++EI;
1098   }
1099 }
1100 
1101 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1102 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1103     assignStackNodesPostOrder(ContextNode *Node,
1104                               DenseSet<const ContextNode *> &Visited,
1105                               DenseMap<uint64_t, std::vector<CallContextInfo>>
1106                                   &StackIdToMatchingCalls) {
1107   auto Inserted = Visited.insert(Node);
1108   if (!Inserted.second)
1109     return;
1110   // Post order traversal. Iterate over a copy since we may add nodes and
1111   // therefore new callers during the recursive call, invalidating any
1112   // iterator over the original edge vector. We don't need to process these
1113   // new nodes as they were already processed on creation.
1114   auto CallerEdges = Node->CallerEdges;
1115   for (auto &Edge : CallerEdges) {
1116     // Skip any that have been removed during the recursion.
1117     if (!Edge)
1118       continue;
1119     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1120   }
1121 
1122   // If this node's stack id is in the map, update the graph to contain new
1123   // nodes representing any inlining at interior callsites. Note we move the
1124   // associated context ids over to the new nodes.
1125 
1126   // Ignore this node if it is for an allocation or we didn't record any
1127   // stack id lists ending at it.
1128   if (Node->IsAllocation ||
1129       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1130     return;
1131 
1132   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1133   // Handle the simple case first. A single call with a single stack id.
1134   // In this case there is no need to create any new context nodes, simply
1135   // assign the context node for stack id to this Call.
1136   if (Calls.size() == 1) {
1137     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1138     if (Ids.size() == 1) {
1139       assert(SavedContextIds.empty());
1140       // It should be this Node
1141       assert(Node == getNodeForStackId(Ids[0]));
1142       if (Node->Recursive)
1143         return;
1144       Node->setCall(Call);
1145       NonAllocationCallToContextNodeMap[Call] = Node;
1146       NodeToCallingFunc[Node] = Func;
1147       return;
1148     }
1149   }
1150 
1151   // Find the node for the last stack id, which should be the same
1152   // across all calls recorded for this id, and is this node's id.
1153   uint64_t LastId = Node->OrigStackOrAllocId;
1154   ContextNode *LastNode = getNodeForStackId(LastId);
1155   // We should only have kept stack ids that had nodes.
1156   assert(LastNode);
1157 
1158   for (unsigned I = 0; I < Calls.size(); I++) {
1159     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1160     // Skip any for which we didn't assign any ids, these don't get a node in
1161     // Skip any for which we didn't assign any ids; these don't get a node in
1162     if (SavedContextIds.empty())
1163       continue;
1164 
1165     assert(LastId == Ids.back());
1166 
1167     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1168     assert(FirstNode);
1169 
1170     // Recompute the context ids for this stack id sequence (the
1171     // intersection of the context ids of the corresponding nodes).
1172     // Start with the ids we saved in the map for this call, which could be
1173     // duplicated context ids. We have to recompute as we might have overlap
1174     // between the saved context ids for different last nodes, and
1175     // removed them already during the post order traversal.
1176     set_intersect(SavedContextIds, FirstNode->ContextIds);
1177     ContextNode *PrevNode = nullptr;
1178     for (auto Id : Ids) {
1179       ContextNode *CurNode = getNodeForStackId(Id);
1180       // We should only have kept stack ids that had nodes and weren't
1181       // recursive.
1182       assert(CurNode);
1183       assert(!CurNode->Recursive);
1184       if (!PrevNode) {
1185         PrevNode = CurNode;
1186         continue;
1187       }
1188       auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1189       if (!Edge) {
1190         SavedContextIds.clear();
1191         break;
1192       }
1193       PrevNode = CurNode;
1194       set_intersect(SavedContextIds, Edge->getContextIds());
1195 
1196       // If we now have no context ids for clone, skip this call.
1197       if (SavedContextIds.empty())
1198         break;
1199     }
1200     if (SavedContextIds.empty())
1201       continue;
1202 
1203     // Create new context node.
1204     NodeOwner.push_back(
1205         std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1206     ContextNode *NewNode = NodeOwner.back().get();
1207     NodeToCallingFunc[NewNode] = Func;
1208     NonAllocationCallToContextNodeMap[Call] = NewNode;
1209     NewNode->ContextIds = SavedContextIds;
1210     NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1211 
1212     // Connect to callees of innermost stack frame in inlined call chain.
1213     // This updates context ids for FirstNode's callees to reflect those
1214     // moved to NewNode.
1215     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1216 
1217     // Connect to callers of outermost stack frame in inlined call chain.
1218     // This updates context ids for LastNode's callers to reflect those
1219     // moved to NewNode.
1220     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1221 
1222     // Now we need to remove context ids from edges/nodes between First and
1223     // Last Node.
1224     PrevNode = nullptr;
1225     for (auto Id : Ids) {
1226       ContextNode *CurNode = getNodeForStackId(Id);
1227       // We should only have kept stack ids that had nodes.
1228       assert(CurNode);
1229 
1230       // Remove the context ids moved to NewNode from CurNode, and the
1231       // edge from the prior node.
1232       set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1233       if (PrevNode) {
1234         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1235         assert(PrevEdge);
1236         set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1237         if (PrevEdge->getContextIds().empty()) {
1238           PrevNode->eraseCallerEdge(PrevEdge);
1239           CurNode->eraseCalleeEdge(PrevEdge);
1240         }
1241       }
1242       PrevNode = CurNode;
1243     }
1244   }
1245 }
1246 
1247 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1248 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1249   // Map of stack id to all calls with that as the last (outermost caller)
1250   // callsite id that has a context node (some might not due to pruning
1251   // performed during matching of the allocation profile contexts).
1252   // The CallContextInfo contains the Call and a list of its stack ids with
1253   // ContextNodes, the function containing Call, and the set of context ids
1254   // the analysis will eventually identify for use in any new node created
1255   // for that callsite.
1256   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1257   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1258     for (auto &Call : CallsWithMetadata) {
1259       // Ignore allocations, already handled.
1260       if (AllocationCallToContextNodeMap.count(Call))
1261         continue;
1262       auto StackIdsWithContextNodes =
1263           getStackIdsWithContextNodesForCall(Call.call());
1264       // If there were no nodes created for MIBs on allocs (maybe this was in
1265       // the unambiguous part of the MIB stack that was pruned), ignore.
1266       if (StackIdsWithContextNodes.empty())
1267         continue;
1268       // Otherwise, record this Call along with the list of ids for the last
1269       // (outermost caller) stack id with a node.
1270       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1271           {Call.call(), StackIdsWithContextNodes, Func, {}});
1272     }
1273   }
1274 
1275   // First make a pass through all stack ids that correspond to a call,
1276   // as identified in the above loop. Compute the context ids corresponding to
1277   // each of these calls when they correspond to multiple stack ids
1278   // due to inlining. Perform any duplication of context ids required when
1279   // there is more than one call with the same stack ids. Their (possibly newly
1280   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1281   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1282   for (auto &It : StackIdToMatchingCalls) {
1283     auto &Calls = It.getSecond();
1284     // Skip single calls with a single stack id. These don't need a new node.
1285     if (Calls.size() == 1) {
1286       auto &Ids = std::get<1>(Calls[0]);
1287       if (Ids.size() == 1)
1288         continue;
1289     }
1290     // In order to do the best and maximal matching of inlined calls to context
1291     // node sequences we will sort the vectors of stack ids in descending order
1292     // of length, and within each length, lexicographically by stack id. The
1293     // latter is so that we can specially handle calls that have identical stack
1294     // id sequences (either due to cloning or artificially because of the MIB
1295     // context pruning).
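    // For example, the stack id sequences {1,2,3}, {1,2}, {1,2} and {4} would
    // sort to {1,2,3}, {1,2}, {1,2}, {4}: longer sequences first, and the two
    // identical {1,2} sequences adjacent so the duplicate handling below can
    // detect them by comparing each entry with its successor.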
1296     std::stable_sort(Calls.begin(), Calls.end(),
1297                      [](const CallContextInfo &A, const CallContextInfo &B) {
1298                        auto &IdsA = std::get<1>(A);
1299                        auto &IdsB = std::get<1>(B);
1300                        return IdsA.size() > IdsB.size() ||
1301                               (IdsA.size() == IdsB.size() && IdsA < IdsB);
1302                      });
1303 
1304     // Find the node for the last stack id, which should be the same
1305     // across all calls recorded for this id, and is the id for this
1306     // entry in the StackIdToMatchingCalls map.
1307     uint64_t LastId = It.getFirst();
1308     ContextNode *LastNode = getNodeForStackId(LastId);
1309     // We should only have kept stack ids that had nodes.
1310     assert(LastNode);
1311 
1312     if (LastNode->Recursive)
1313       continue;
1314 
1315     // Initialize the context ids with the last node's. We will subsequently
1316     // refine the context ids by computing the intersection along all edges.
1317     DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1318     assert(!LastNodeContextIds.empty());
1319 
1320     for (unsigned I = 0; I < Calls.size(); I++) {
1321       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1322       assert(SavedContextIds.empty());
1323       assert(LastId == Ids.back());
1324 
1325       // First compute the context ids for this stack id sequence (the
1326       // intersection of the context ids of the corresponding nodes).
1327       // Start with the remaining saved ids for the last node.
1328       assert(!LastNodeContextIds.empty());
1329       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1330 
1331       ContextNode *PrevNode = LastNode;
1332       ContextNode *CurNode = LastNode;
1333       bool Skip = false;
1334 
1335       // Iterate backwards through the stack Ids, starting after the last Id
1336       // in the list, which was handled once outside for all Calls.
1337       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1338         auto Id = *IdIter;
1339         CurNode = getNodeForStackId(Id);
1340         // We should only have kept stack ids that had nodes.
1341         assert(CurNode);
1342 
1343         if (CurNode->Recursive) {
1344           Skip = true;
1345           break;
1346         }
1347 
1348         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1349         // If there is no edge then the nodes belong to different MIB contexts,
1350         // and we should skip this inlined context sequence. For example, this
1351         // particular inlined context may include stack ids A->B, and we may
1352         // indeed have nodes for both A and B, but it is possible that they were
1353         // never profiled in sequence in a single MIB for any allocation (i.e.
1354         // we might have profiled an allocation that involves the callsite A,
1355         // but through a different one of its callee callsites, and we might
1356         // have profiled an allocation that involves callsite B, but reached
1357         // from a different caller callsite).
1358         if (!Edge) {
1359           Skip = true;
1360           break;
1361         }
1362         PrevNode = CurNode;
1363 
1364         // Update the context ids, which is the intersection of the ids along
1365         // all edges in the sequence.
1366         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1367 
1368         // If we now have no context ids for the clone, skip this call.
1369         if (StackSequenceContextIds.empty()) {
1370           Skip = true;
1371           break;
1372         }
1373       }
1374       if (Skip)
1375         continue;
1376 
1377       // If some of this call's stack ids did not have corresponding nodes (due
1378       // to pruning), don't include any context ids for contexts that extend
1379       // beyond these nodes. Otherwise we would partially match unrelated or
1380       // incompletely matching stack contexts. To do this, subtract any context ids
1381       // found in caller nodes of the last node found above.
1382       if (Ids.back() != getLastStackId(Call)) {
1383         for (const auto &PE : CurNode->CallerEdges) {
1384           set_subtract(StackSequenceContextIds, PE->getContextIds());
1385           if (StackSequenceContextIds.empty())
1386             break;
1387         }
1388         // If we now have no context ids for the clone, skip this call.
1389         if (StackSequenceContextIds.empty())
1390           continue;
1391       }
1392 
1393       // Check if the next set of stack ids is the same (since the Calls vector
1394       // of tuples is sorted by the stack ids we can just look at the next one).
1395       bool DuplicateContextIds = false;
1396       if (I + 1 < Calls.size()) {
1397         auto NextIds = std::get<1>(Calls[I + 1]);
1398         DuplicateContextIds = Ids == NextIds;
1399       }
1400 
1401       // If we don't have duplicate context ids, then we can assign all the
1402       // context ids computed for the original node sequence to this call.
1403       // If there are duplicate calls with the same stack ids then we synthesize
1404       // new context ids that are duplicates of the originals. These are
1405       // assigned to SavedContextIds, which is a reference into the map entry
1406       // for this call, allowing us to access these ids later on.
1407       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1408                                  StackSequenceContextIds.size());
1409       SavedContextIds =
1410           DuplicateContextIds
1411               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1412               : StackSequenceContextIds;
1413       assert(!SavedContextIds.empty());
1414 
1415       if (!DuplicateContextIds) {
1416         // Update saved last node's context ids to remove those that are
1417         // assigned to other calls, so that it is ready for the next call at
1418         // this stack id.
1419         set_subtract(LastNodeContextIds, StackSequenceContextIds);
1420         if (LastNodeContextIds.empty())
1421           break;
1422       }
1423     }
1424   }
1425 
1426   // Propagate the duplicate context ids over the graph.
1427   propagateDuplicateContextIds(OldToNewContextIds);
1428 
1429   if (VerifyCCG)
1430     check();
1431 
1432   // Now perform a post-order traversal over the graph, starting with the
1433   // allocation nodes, essentially processing nodes from callers to callees.
1434   // For any that contains an id in the map, update the graph to contain new
1435   // nodes representing any inlining at interior callsites. Note we move the
1436   // associated context ids over to the new nodes.
1437   DenseSet<const ContextNode *> Visited;
1438   for (auto &Entry : AllocationCallToContextNodeMap)
1439     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1440 }
1441 
1442 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1443   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1444       Call->getMetadata(LLVMContext::MD_callsite));
1445   return CallsiteContext.back();
1446 }
1447 
1448 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1449   assert(isa<CallsiteInfo *>(Call.getBase()));
1450   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1451       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1452   // Need to convert index into stack id.
1453   return Index.getStackIdAtIndex(CallsiteContext.back());
1454 }
1455 
1456 static const std::string MemProfCloneSuffix = ".memprof.";
1457 
1458 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1459   // We use CloneNo == 0 to refer to the original version, which doesn't get
1460   // renamed with a suffix.
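  // E.g., CloneNo 1 for a function named "foo" produces "foo.memprof.1".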
1461   if (!CloneNo)
1462     return Base.str();
1463   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1464 }
1465 
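// Label used for nodes in the exported dot graph. For the IR case this is of
// the form "<caller function> -> <called function>".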
1466 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1467                                                  const Instruction *Call,
1468                                                  unsigned CloneNo) const {
1469   return (Twine(Call->getFunction()->getName()) + " -> " +
1470           cast<CallBase>(Call)->getCalledFunction()->getName())
1471       .str();
1472 }
1473 
1474 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1475                                                 const IndexCall &Call,
1476                                                 unsigned CloneNo) const {
1477   auto VI = FSToVIMap.find(Func);
1478   assert(VI != FSToVIMap.end());
1479   if (isa<AllocInfo *>(Call.getBase()))
1480     return (VI->second.name() + " -> alloc").str();
1481   else {
1482     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1483     return (VI->second.name() + " -> " +
1484             getMemProfFuncName(Callsite->Callee.name(),
1485                                Callsite->Clones[CloneNo]))
1486         .str();
1487   }
1488 }
1489 
1490 std::vector<uint64_t>
1491 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1492     Instruction *Call) {
1493   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1494       Call->getMetadata(LLVMContext::MD_callsite));
1495   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1496       CallsiteContext);
1497 }
1498 
1499 std::vector<uint64_t>
1500 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1501   assert(isa<CallsiteInfo *>(Call.getBase()));
1502   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1503       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1504   return getStackIdsWithContextNodes<CallsiteInfo,
1505                                      SmallVector<unsigned>::const_iterator>(
1506       CallsiteContext);
1507 }
1508 
1509 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1510 template <class NodeT, class IteratorT>
1511 std::vector<uint64_t>
1512 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1513     CallStack<NodeT, IteratorT> &CallsiteContext) {
1514   std::vector<uint64_t> StackIds;
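  // Walk the callsite context from the leaf (innermost) frame outwards,
  // stopping at the first stack id that has no context node (i.e. one that was
  // pruned during matching of the allocation profile contexts).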
1515   for (auto IdOrIndex : CallsiteContext) {
1516     auto StackId = getStackId(IdOrIndex);
1517     ContextNode *Node = getNodeForStackId(StackId);
1518     if (!Node)
1519       break;
1520     StackIds.push_back(StackId);
1521   }
1522   return StackIds;
1523 }
1524 
1525 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1526     Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1527     : Mod(M), OREGetter(OREGetter) {
1528   for (auto &F : M) {
1529     std::vector<CallInfo> CallsWithMetadata;
1530     for (auto &BB : F) {
1531       for (auto &I : BB) {
1532         if (!isa<CallBase>(I))
1533           continue;
1534         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1535           CallsWithMetadata.push_back(&I);
1536           auto *AllocNode = addAllocNode(&I, &F);
1537           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1538           assert(CallsiteMD);
1539           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1540           // Add all of the MIBs and their stack nodes.
1541           for (auto &MDOp : MemProfMD->operands()) {
1542             auto *MIBMD = cast<const MDNode>(MDOp);
1543             MDNode *StackNode = getMIBStackNode(MIBMD);
1544             assert(StackNode);
1545             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1546             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1547                 AllocNode, StackContext, CallsiteContext,
1548                 getMIBAllocType(MIBMD));
1549           }
1550           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1551           // Memprof and callsite metadata on memory allocations no longer
1552           // needed.
1553           I.setMetadata(LLVMContext::MD_memprof, nullptr);
1554           I.setMetadata(LLVMContext::MD_callsite, nullptr);
1555         }
1556         // For callsite metadata, add to list for this function for later use.
1557         else if (I.getMetadata(LLVMContext::MD_callsite))
1558           CallsWithMetadata.push_back(&I);
1559       }
1560     }
1561     if (!CallsWithMetadata.empty())
1562       FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1563   }
1564 
1565   if (DumpCCG) {
1566     dbgs() << "CCG before updating call stack chains:\n";
1567     dbgs() << *this;
1568   }
1569 
1570   if (ExportToDot)
1571     exportToDot("prestackupdate");
1572 
1573   updateStackNodes();
1574 
1575   handleCallsitesWithMultipleTargets();
1576 
1577   // Strip off remaining callsite metadata, no longer needed.
1578   for (auto &FuncEntry : FuncToCallsWithMetadata)
1579     for (auto &Call : FuncEntry.second)
1580       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1581 }
1582 
1583 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1584     ModuleSummaryIndex &Index,
1585     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1586         isPrevailing)
1587     : Index(Index), isPrevailing(isPrevailing) {
1588   for (auto &I : Index) {
1589     auto VI = Index.getValueInfo(I);
1590     for (auto &S : VI.getSummaryList()) {
1591       // We should only add the prevailing nodes. Otherwise we may try to clone
1592       // in a weak copy that won't be linked (and may be different than the
1593       // prevailing version).
1594       // We only keep the memprof summary on the prevailing copy now when
1595       // building the combined index, as a space optimization. However, don't
1596       // rely on this optimization. The linker doesn't resolve local linkage
1597       // values so don't check whether those are prevailing.
1598       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1599           !isPrevailing(VI.getGUID(), S.get()))
1600         continue;
1601       auto *FS = dyn_cast<FunctionSummary>(S.get());
1602       if (!FS)
1603         continue;
1604       std::vector<CallInfo> CallsWithMetadata;
1605       if (!FS->allocs().empty()) {
1606         for (auto &AN : FS->mutableAllocs()) {
1607           // This can happen because of recursion elimination handling that
1608           // currently exists in ModuleSummaryAnalysis. Skip these for now.
1609           // We still added them to the summary because we need to be able to
1610           // correlate properly in applyImport in the backends.
1611           if (AN.MIBs.empty())
1612             continue;
1613           CallsWithMetadata.push_back({&AN});
1614           auto *AllocNode = addAllocNode({&AN}, FS);
1615           // Pass an empty CallStack to the CallsiteContext (second)
1616           // parameter, since for ThinLTO we already collapsed out the inlined
1617           // stack ids on the allocation call during ModuleSummaryAnalysis.
1618           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1619               EmptyContext;
1620           // Now add all of the MIBs and their stack nodes.
1621           for (auto &MIB : AN.MIBs) {
1622             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1623                 StackContext(&MIB);
1624             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1625                 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1626           }
1627           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1628           // Initialize version 0 on the summary alloc node to the current alloc
1629           // type, unless it has both types, in which case make it the default.
1630           // That way, if we are unable to clone later, the original version
1631           // always ends up with the default allocation behavior.
1632           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1633         }
1634       }
1635       // For callsite metadata, add to list for this function for later use.
1636       if (!FS->callsites().empty())
1637         for (auto &SN : FS->mutableCallsites())
1638           CallsWithMetadata.push_back({&SN});
1639 
1640       if (!CallsWithMetadata.empty())
1641         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1642 
1643       if (!FS->allocs().empty() || !FS->callsites().empty())
1644         FSToVIMap[FS] = VI;
1645     }
1646   }
1647 
1648   if (DumpCCG) {
1649     dbgs() << "CCG before updating call stack chains:\n";
1650     dbgs() << *this;
1651   }
1652 
1653   if (ExportToDot)
1654     exportToDot("prestackupdate");
1655 
1656   updateStackNodes();
1657 
1658   handleCallsitesWithMultipleTargets();
1659 }
1660 
1661 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1662 void CallsiteContextGraph<DerivedCCG, FuncTy,
1663                           CallTy>::handleCallsitesWithMultipleTargets() {
1664   // Look for and work around callsites that call multiple functions.
1665   // This can happen for indirect calls, which need better handling, and in
1666   // rarer cases (e.g. macro expansion).
1667   // TODO: To fix this for indirect calls we will want to perform speculative
1668   // devirtualization using either the normal PGO info with ICP, or using the
1669   // information in the profiled MemProf contexts. We can do this prior to
1670   // this transformation for regular LTO, and for ThinLTO we can simulate that
1671   // effect in the summary and perform the actual speculative devirtualization
1672   // while cloning in the ThinLTO backend.
1673 
1674   // Keep track of the new nodes synthesized for discovered tail calls missing
1675   // from the profiled contexts.
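  // E.g., a profiled context may show a callsite in function A leading
  // directly into C, while in the IR that callsite calls B, which tail-calls
  // C (B's frame was elided from the profiled stack by the tail call). In
  // that case calleesMatch discovers the B -> C chain and synthesizes a node
  // for B's tail call so that the graph reflects the IR.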
1676   MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1677 
1678   for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1679        Entry != NonAllocationCallToContextNodeMap.end();) {
1680     auto *Node = Entry->second;
1681     assert(Node->Clones.empty());
1682     // Check all node callees and see if in the same function.
1683     bool Removed = false;
1684     auto Call = Node->Call.call();
1685     for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1686       auto Edge = *EI;
1687       if (!Edge->Callee->hasCall()) {
1688         ++EI;
1689         continue;
1690       }
1691       assert(NodeToCallingFunc.count(Edge->Callee));
1692       // Check if the called function matches that of the callee node.
1693       if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1694         continue;
1695       RemovedEdgesWithMismatchedCallees++;
1696       // Work around by setting Node to have a null call, so it gets
1697       // skipped during cloning. Otherwise assignFunctions will assert
1698       // because its data structures are not designed to handle this case.
1699       Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1700       Node->setCall(CallInfo());
1701       Removed = true;
1702       break;
1703     }
1704     if (!Removed)
1705       Entry++;
1706   }
1707 
1708   // Add the new nodes after the above loop so that the iteration is not
1709   // invalidated.
1710   for (auto &[Call, Node] : TailCallToContextNodeMap)
1711     NonAllocationCallToContextNodeMap[Call] = Node;
1712 }
1713 
1714 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1715   // In the Module (IR) case this is already the Id.
1716   return IdOrIndex;
1717 }
1718 
1719 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1720   // In the Index case this is an index into the stack id list in the summary
1721   // index, convert it to an Id.
1722   return Index.getStackIdAtIndex(IdOrIndex);
1723 }
1724 
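// Check whether the callee of Call in the IR (or summary) matches the profiled
// callee of the edge at EI. If they differ but the profiled callee is reachable
// from the actual callee through a unique chain of tail calls, synthesize nodes
// for that chain (recorded in TailCallToContextNodeMap) and splice them in
// between the edge's caller and callee.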
1725 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1726 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1727     CallTy Call, EdgeIter &EI,
1728     MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1729   auto Edge = *EI;
1730   const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1731   const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1732   // Will be populated in order of callee to caller if we find a chain of tail
1733   // calls between the profiled caller and callee.
1734   std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1735   if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1736                          FoundCalleeChain)) {
1737     ++EI;
1738     return false;
1739   }
1740 
1741   // The usual case where the profiled callee matches that of the IR/summary.
1742   if (FoundCalleeChain.empty()) {
1743     ++EI;
1744     return true;
1745   }
1746 
1747   auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1748     auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1749     // If there is already an edge between these nodes, simply update it and
1750     // return.
1751     if (CurEdge) {
1752       CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1753                                  Edge->ContextIds.end());
1754       CurEdge->AllocTypes |= Edge->AllocTypes;
1755       return;
1756     }
1757     // Otherwise, create a new edge and insert it into the caller and callee
1758     // lists.
1759     auto NewEdge = std::make_shared<ContextEdge>(
1760         Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1761     Callee->CallerEdges.push_back(NewEdge);
1762     if (Caller == Edge->Caller) {
1763       // If we are inserting the new edge into the current edge's caller, insert
1764       // the new edge before the current iterator position, and then increment
1765       // back to the current edge.
1766       EI = Caller->CalleeEdges.insert(EI, NewEdge);
1767       ++EI;
1768       assert(*EI == Edge &&
1769              "Iterator position not restored after insert and increment");
1770     } else
1771       Caller->CalleeEdges.push_back(NewEdge);
1772   };
1773 
1774   // Create new nodes for each found callee and connect in between the profiled
1775   // caller and callee.
1776   auto *CurCalleeNode = Edge->Callee;
1777   for (auto &[NewCall, Func] : FoundCalleeChain) {
1778     ContextNode *NewNode = nullptr;
1779     // First check if we have already synthesized a node for this tail call.
1780     if (TailCallToContextNodeMap.count(NewCall)) {
1781       NewNode = TailCallToContextNodeMap[NewCall];
1782       NewNode->ContextIds.insert(Edge->ContextIds.begin(),
1783                                  Edge->ContextIds.end());
1784       NewNode->AllocTypes |= Edge->AllocTypes;
1785     } else {
1786       FuncToCallsWithMetadata[Func].push_back({NewCall});
1787       // Create Node and record node info.
1788       NodeOwner.push_back(
1789           std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1790       NewNode = NodeOwner.back().get();
1791       NodeToCallingFunc[NewNode] = Func;
1792       TailCallToContextNodeMap[NewCall] = NewNode;
1793       NewNode->ContextIds = Edge->ContextIds;
1794       NewNode->AllocTypes = Edge->AllocTypes;
1795     }
1796 
1797     // Hook up node to its callee node
1798     AddEdge(NewNode, CurCalleeNode);
1799 
1800     CurCalleeNode = NewNode;
1801   }
1802 
1803   // Hook up edge's original caller to new callee node.
1804   AddEdge(Edge->Caller, CurCalleeNode);
1805 
1806   // Remove old edge
1807   Edge->Callee->eraseCallerEdge(Edge.get());
1808   EI = Edge->Caller->CalleeEdges.erase(EI);
1809 
1810   return true;
1811 }
1812 
1813 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1814     const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1815     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1816     bool &FoundMultipleCalleeChains) {
1817   // Stop recursive search if we have already explored the maximum specified
1818   // depth.
1819   if (Depth > TailCallSearchDepth)
1820     return false;
1821 
1822   auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1823     FoundCalleeChain.push_back({Callsite, F});
1824   };
1825 
1826   auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1827   if (!CalleeFunc) {
1828     auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1829     assert(Alias);
1830     CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1831     assert(CalleeFunc);
1832   }
1833 
1834   // Look for tail calls in this function, and check if they either call the
1835   // profiled callee directly, or indirectly (via a recursive search).
1836   // Only succeed if there is a single unique tail call chain found between the
1837   // profiled caller and callee, otherwise we could perform incorrect cloning.
1838   bool FoundSingleCalleeChain = false;
1839   for (auto &BB : *CalleeFunc) {
1840     for (auto &I : BB) {
1841       auto *CB = dyn_cast<CallBase>(&I);
1842       if (!CB || !CB->isTailCall())
1843         continue;
1844       auto *CalledValue = CB->getCalledOperand();
1845       auto *CalledFunction = CB->getCalledFunction();
1846       if (CalledValue && !CalledFunction) {
1847         CalledValue = CalledValue->stripPointerCasts();
1848         // Stripping pointer casts can reveal a called function.
1849         CalledFunction = dyn_cast<Function>(CalledValue);
1850       }
1851       // Check if this is an alias to a function. If so, get the
1852       // called aliasee for the checks below.
1853       if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
1854         assert(!CalledFunction &&
1855                "Expected null called function in callsite for alias");
1856         CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
1857       }
1858       if (!CalledFunction)
1859         continue;
1860       if (CalledFunction == ProfiledCallee) {
1861         if (FoundSingleCalleeChain) {
1862           FoundMultipleCalleeChains = true;
1863           return false;
1864         }
1865         FoundSingleCalleeChain = true;
1866         FoundProfiledCalleeCount++;
1867         FoundProfiledCalleeDepth += Depth;
1868         if (Depth > FoundProfiledCalleeMaxDepth)
1869           FoundProfiledCalleeMaxDepth = Depth;
1870         SaveCallsiteInfo(&I, CalleeFunc);
1871       } else if (findProfiledCalleeThroughTailCalls(
1872                      ProfiledCallee, CalledFunction, Depth + 1,
1873                      FoundCalleeChain, FoundMultipleCalleeChains)) {
1874         if (FoundMultipleCalleeChains)
1875           return false;
1876         if (FoundSingleCalleeChain) {
1877           FoundMultipleCalleeChains = true;
1878           return false;
1879         }
1880         FoundSingleCalleeChain = true;
1881         SaveCallsiteInfo(&I, CalleeFunc);
1882       }
1883     }
1884   }
1885 
1886   return FoundSingleCalleeChain;
1887 }
1888 
1889 bool ModuleCallsiteContextGraph::calleeMatchesFunc(
1890     Instruction *Call, const Function *Func, const Function *CallerFunc,
1891     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
1892   auto *CB = dyn_cast<CallBase>(Call);
1893   if (!CB->getCalledOperand())
1894     return false;
1895   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1896   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1897   if (CalleeFunc == Func)
1898     return true;
1899   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1900   if (Alias && Alias->getAliasee() == Func)
1901     return true;
1902 
1903   // Recursively search for the profiled callee through tail calls starting with
1904   // the actual Callee. The discovered tail call chain is saved in
1905   // FoundCalleeChain, and we will fix up the graph to include these callsites
1906   // after returning.
1907   // FIXME: We will currently redo the same recursive walk if we find the same
1908   // mismatched callee from another callsite. We can improve this with more
1909   // bookkeeping of the created chain of new nodes for each mismatch.
1910   unsigned Depth = 1;
1911   bool FoundMultipleCalleeChains = false;
1912   if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
1913                                           FoundCalleeChain,
1914                                           FoundMultipleCalleeChains)) {
1915     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
1916                       << Func->getName() << " from " << CallerFunc->getName()
1917                       << " that actually called " << CalleeVal->getName()
1918                       << (FoundMultipleCalleeChains
1919                               ? " (found multiple possible chains)"
1920                               : "")
1921                       << "\n");
1922     if (FoundMultipleCalleeChains)
1923       FoundProfiledCalleeNonUniquelyCount++;
1924     return false;
1925   }
1926 
1927   return true;
1928 }
1929 
1930 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1931     ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
1932     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
1933     bool &FoundMultipleCalleeChains) {
1934   // Stop recursive search if we have already explored the maximum specified
1935   // depth.
1936   if (Depth > TailCallSearchDepth)
1937     return false;
1938 
1939   auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
1940     // Make a CallsiteInfo for each discovered callee, if one hasn't already
1941     // been synthesized.
1942     if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
1943         !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
1944       // StackIds is empty (we don't have debug info available in the index for
1945       // these callsites)
1946       FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
1947           std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
1948     CallsiteInfo *NewCallsiteInfo =
1949         FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
1950     FoundCalleeChain.push_back({NewCallsiteInfo, FS});
1951   };
1952 
1953   // Look for tail calls in this function, and check if they either call the
1954   // profiled callee directly, or indirectly (via a recursive search).
1955   // Only succeed if there is a single unique tail call chain found between the
1956   // profiled caller and callee, otherwise we could perform incorrect cloning.
1957   bool FoundSingleCalleeChain = false;
1958   for (auto &S : CurCallee.getSummaryList()) {
1959     if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1960         !isPrevailing(CurCallee.getGUID(), S.get()))
1961       continue;
1962     auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
1963     if (!FS)
1964       continue;
1965     auto FSVI = CurCallee;
1966     auto *AS = dyn_cast<AliasSummary>(S.get());
1967     if (AS)
1968       FSVI = AS->getAliaseeVI();
1969     for (auto &CallEdge : FS->calls()) {
1970       if (!CallEdge.second.hasTailCall())
1971         continue;
1972       if (CallEdge.first == ProfiledCallee) {
1973         if (FoundSingleCalleeChain) {
1974           FoundMultipleCalleeChains = true;
1975           return false;
1976         }
1977         FoundSingleCalleeChain = true;
1978         FoundProfiledCalleeCount++;
1979         FoundProfiledCalleeDepth += Depth;
1980         if (Depth > FoundProfiledCalleeMaxDepth)
1981           FoundProfiledCalleeMaxDepth = Depth;
1982         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
1983         // Add FS to FSToVIMap in case it isn't already there.
1984         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
1985         FSToVIMap[FS] = FSVI;
1986       } else if (findProfiledCalleeThroughTailCalls(
1987                      ProfiledCallee, CallEdge.first, Depth + 1,
1988                      FoundCalleeChain, FoundMultipleCalleeChains)) {
1989         if (FoundMultipleCalleeChains)
1990           return false;
1991         if (FoundSingleCalleeChain) {
1992           FoundMultipleCalleeChains = true;
1993           return false;
1994         }
1995         FoundSingleCalleeChain = true;
1996         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
1997         // Add FS to FSToVIMap  in case it isn't already there.
1998         // Add FS to FSToVIMap in case it isn't already there.
1999         FSToVIMap[FS] = FSVI;
2000       }
2001     }
2002   }
2003 
2004   return FoundSingleCalleeChain;
2005 }
2006 
2007 bool IndexCallsiteContextGraph::calleeMatchesFunc(
2008     IndexCall &Call, const FunctionSummary *Func,
2009     const FunctionSummary *CallerFunc,
2010     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2011   ValueInfo Callee =
2012       dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2013   // If there is no summary list then this is a call to an externally defined
2014   // symbol.
2015   AliasSummary *Alias =
2016       Callee.getSummaryList().empty()
2017           ? nullptr
2018           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2019   assert(FSToVIMap.count(Func));
2020   auto FuncVI = FSToVIMap[Func];
2021   if (Callee == FuncVI ||
2022       // If callee is an alias, check the aliasee, since only function
2023       // summary base objects will contain the stack node summaries and thus
2024       // get a context node.
2025       (Alias && Alias->getAliaseeVI() == FuncVI))
2026     return true;
2027 
2028   // Recursively search for the profiled callee through tail calls starting with
2029   // the actual Callee. The discovered tail call chain is saved in
2030   // FoundCalleeChain, and we will fix up the graph to include these callsites
2031   // after returning.
2032   // FIXME: We will currently redo the same recursive walk if we find the same
2033   // mismatched callee from another callsite. We can improve this with more
2034   // bookkeeping of the created chain of new nodes for each mismatch.
2035   unsigned Depth = 1;
2036   bool FoundMultipleCalleeChains = false;
2037   if (!findProfiledCalleeThroughTailCalls(
2038           FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2039     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2040                       << " from " << FSToVIMap[CallerFunc]
2041                       << " that actually called " << Callee
2042                       << (FoundMultipleCalleeChains
2043                               ? " (found multiple possible chains)"
2044                               : "")
2045                       << "\n");
2046     if (FoundMultipleCalleeChains)
2047       FoundProfiledCalleeNonUniquelyCount++;
2048     return false;
2049   }
2050 
2051   return true;
2052 }
2053 
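// Returns a human-readable string for the given alloc type bit mask, used when
// printing nodes and edges. Both bits set yields "NotColdCold".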
2054 static std::string getAllocTypeString(uint8_t AllocTypes) {
2055   if (!AllocTypes)
2056     return "None";
2057   std::string Str;
2058   if (AllocTypes & (uint8_t)AllocationType::NotCold)
2059     Str += "NotCold";
2060   if (AllocTypes & (uint8_t)AllocationType::Cold)
2061     Str += "Cold";
2062   return Str;
2063 }
2064 
2065 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2066 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2067     const {
2068   print(dbgs());
2069   dbgs() << "\n";
2070 }
2071 
2072 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2073 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2074     raw_ostream &OS) const {
2075   OS << "Node " << this << "\n";
2076   OS << "\t";
2077   printCall(OS);
2078   if (Recursive)
2079     OS << " (recursive)";
2080   OS << "\n";
2081   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2082   OS << "\tContextIds:";
2083   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2084   std::sort(SortedIds.begin(), SortedIds.end());
2085   for (auto Id : SortedIds)
2086     OS << " " << Id;
2087   OS << "\n";
2088   OS << "\tCalleeEdges:\n";
2089   for (auto &Edge : CalleeEdges)
2090     OS << "\t\t" << *Edge << "\n";
2091   OS << "\tCallerEdges:\n";
2092   for (auto &Edge : CallerEdges)
2093     OS << "\t\t" << *Edge << "\n";
2094   if (!Clones.empty()) {
2095     OS << "\tClones: ";
2096     FieldSeparator FS;
2097     for (auto *Clone : Clones)
2098       OS << FS << Clone;
2099     OS << "\n";
2100   } else if (CloneOf) {
2101     OS << "\tClone of " << CloneOf << "\n";
2102   }
2103 }
2104 
2105 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2106 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2107     const {
2108   print(dbgs());
2109   dbgs() << "\n";
2110 }
2111 
2112 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2113 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2114     raw_ostream &OS) const {
2115   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2116      << " AllocTypes: " << getAllocTypeString(AllocTypes);
2117   OS << " ContextIds:";
2118   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2119   std::sort(SortedIds.begin(), SortedIds.end());
2120   for (auto Id : SortedIds)
2121     OS << " " << Id;
2122 }
2123 
2124 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2125 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2126   print(dbgs());
2127 }
2128 
2129 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2130 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2131     raw_ostream &OS) const {
2132   OS << "Callsite Context Graph:\n";
2133   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2134   for (const auto Node : nodes<GraphType>(this)) {
2135     if (Node->isRemoved())
2136       continue;
2137     Node->print(OS);
2138     OS << "\n";
2139   }
2140 }
2141 
2142 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2143 static void checkEdge(
2144     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
2145   // Confirm that alloc type is not None and that we have at least one context
2146   // id.
2147   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
2148   assert(!Edge->ContextIds.empty());
2149 }
2150 
2151 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2152 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
2153                       bool CheckEdges = true) {
2154   if (Node->isRemoved())
2155     return;
2156   // Node's context ids should be the union of both its callee and caller edge
2157   // context ids.
2158   if (Node->CallerEdges.size()) {
2159     auto EI = Node->CallerEdges.begin();
2160     auto &FirstEdge = *EI;
2161     EI++;
2162     DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
2163     for (; EI != Node->CallerEdges.end(); EI++) {
2164       const auto &Edge = *EI;
2165       if (CheckEdges)
2166         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2167       set_union(CallerEdgeContextIds, Edge->ContextIds);
2168     }
2169     // Node can have more context ids than callers if some contexts terminate at
2170     // node and some are longer.
2171     assert(Node->ContextIds == CallerEdgeContextIds ||
2172            set_is_subset(CallerEdgeContextIds, Node->ContextIds));
2173   }
2174   if (Node->CalleeEdges.size()) {
2175     auto EI = Node->CalleeEdges.begin();
2176     auto &FirstEdge = *EI;
2177     EI++;
2178     DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
2179     for (; EI != Node->CalleeEdges.end(); EI++) {
2180       const auto &Edge = *EI;
2181       if (CheckEdges)
2182         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2183       set_union(CalleeEdgeContextIds, Edge->ContextIds);
2184     }
2185     assert(Node->ContextIds == CalleeEdgeContextIds);
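    // The callee edge context ids are expected to exactly cover the node's,
    // since contexts are built up from the allocations at the callee end of
    // the graph.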
2186   }
2187 }
2188 
2189 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2190 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2191   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2192   for (const auto Node : nodes<GraphType>(this)) {
2193     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2194     for (auto &Edge : Node->CallerEdges)
2195       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2196   }
2197 }
2198 
2199 template <typename DerivedCCG, typename FuncTy, typename CallTy>
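// GraphTraits specialization so that generic graph iteration (nodes<>) and dot
// export via WriteGraph can walk the CallsiteContextGraph. Nodes are iterated
// via the NodeOwner list and children follow the callee edges.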
2200 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2201   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2202   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2203 
2204   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2205   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2206 
2207   using nodes_iterator =
2208       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2209                       decltype(&getNode)>;
2210 
2211   static nodes_iterator nodes_begin(GraphType G) {
2212     return nodes_iterator(G->NodeOwner.begin(), &getNode);
2213   }
2214 
2215   static nodes_iterator nodes_end(GraphType G) {
2216     return nodes_iterator(G->NodeOwner.end(), &getNode);
2217   }
2218 
2219   static NodeRef getEntryNode(GraphType G) {
2220     return G->NodeOwner.begin()->get();
2221   }
2222 
2223   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2224   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2225   GetCallee(const EdgePtrTy &P) {
2226     return P->Callee;
2227   }
2228 
2229   using ChildIteratorType =
2230       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2231                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2232                       decltype(&GetCallee)>;
2233 
2234   static ChildIteratorType child_begin(NodeRef N) {
2235     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2236   }
2237 
2238   static ChildIteratorType child_end(NodeRef N) {
2239     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2240   }
2241 };
2242 
2243 template <typename DerivedCCG, typename FuncTy, typename CallTy>
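// DOTGraphTraits specialization controlling how nodes and edges are labeled
// and colored in the exported dot files (see exportToDot below).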
2244 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2245     : public DefaultDOTGraphTraits {
2246   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2247 
2248   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2249   using GTraits = GraphTraits<GraphType>;
2250   using NodeRef = typename GTraits::NodeRef;
2251   using ChildIteratorType = typename GTraits::ChildIteratorType;
2252 
2253   static std::string getNodeLabel(NodeRef Node, GraphType G) {
2254     std::string LabelString =
2255         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2256          Twine(Node->OrigStackOrAllocId))
2257             .str();
2258     LabelString += "\n";
2259     if (Node->hasCall()) {
2260       auto Func = G->NodeToCallingFunc.find(Node);
2261       assert(Func != G->NodeToCallingFunc.end());
2262       LabelString +=
2263           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2264     } else {
2265       LabelString += "null call";
2266       if (Node->Recursive)
2267         LabelString += " (recursive)";
2268       else
2269         LabelString += " (external)";
2270     }
2271     return LabelString;
2272   }
2273 
2274   static std::string getNodeAttributes(NodeRef Node, GraphType) {
2275     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2276                                    getContextIds(Node->ContextIds) + "\"")
2277                                       .str();
2278     AttributeString +=
2279         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2280     AttributeString += ",style=\"filled\"";
2281     if (Node->CloneOf) {
2282       AttributeString += ",color=\"blue\"";
2283       AttributeString += ",style=\"filled,bold,dashed\"";
2284     } else
2285       AttributeString += ",style=\"filled\"";
2286     return AttributeString;
2287   }
2288 
2289   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2290                                        GraphType) {
2291     auto &Edge = *(ChildIter.getCurrent());
2292     return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2293             Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2294         .str();
2295   }
2296 
2297   // Since the NodeOwner list includes nodes that are no longer connected to
2298   // the graph, skip them here.
2299   static bool isNodeHidden(NodeRef Node, GraphType) {
2300     return Node->isRemoved();
2301   }
2302 
2303 private:
2304   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2305     std::string IdString = "ContextIds:";
2306     if (ContextIds.size() < 100) {
2307       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2308       std::sort(SortedIds.begin(), SortedIds.end());
2309       for (auto Id : SortedIds)
2310         IdString += (" " + Twine(Id)).str();
2311     } else {
2312       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2313     }
2314     return IdString;
2315   }
2316 
2317   static std::string getColor(uint8_t AllocTypes) {
2318     if (AllocTypes == (uint8_t)AllocationType::NotCold)
2319       // Color "brown1" actually looks like a lighter red.
2320       return "brown1";
2321     if (AllocTypes == (uint8_t)AllocationType::Cold)
2322       return "cyan";
2323     if (AllocTypes ==
2324         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2325       // Lighter purple.
2326       return "mediumorchid1";
2327     return "gray";
2328   }
2329 
2330   static std::string getNodeId(NodeRef Node) {
2331     std::stringstream SStream;
2332     SStream << std::hex << "N0x" << (unsigned long long)Node;
2333     std::string Result = SStream.str();
2334     return Result;
2335   }
2336 };
2337 
2338 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2339 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2340     std::string Label) const {
2341   WriteGraph(this, "", false, Label,
2342              DotFilePathPrefix + "ccg." + Label + ".dot");
2343 }
2344 
2345 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2346 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2347 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2348     const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
2349   ContextNode *Node = Edge->Callee;
2350   NodeOwner.push_back(
2351       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2352   ContextNode *Clone = NodeOwner.back().get();
2353   Node->addClone(Clone);
2354   assert(NodeToCallingFunc.count(Node));
2355   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2356   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
2357   return Clone;
2358 }
2359 
2360 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2361 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2362     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2363                                   ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2364                                   bool NewClone) {
2365   // NewCallee and Edge's current callee must be clones of the same original
2366   // node (Edge's current callee may be the original node too).
2367   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2368   auto &EdgeContextIds = Edge->getContextIds();
2369   ContextNode *OldCallee = Edge->Callee;
2370   if (CallerEdgeI)
2371     *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2372   else
2373     OldCallee->eraseCallerEdge(Edge.get());
2374   Edge->Callee = NewCallee;
2375   NewCallee->CallerEdges.push_back(Edge);
2376   // Don't need to update Edge's context ids since we are simply reconnecting
2377   // it.
2378   set_subtract(OldCallee->ContextIds, EdgeContextIds);
2379   NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2380   NewCallee->AllocTypes |= Edge->AllocTypes;
2381   OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2382   // OldCallee alloc type should be None iff its context id set is now empty.
2383   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2384          OldCallee->ContextIds.empty());
2385   // Now walk the old callee node's callee edges and move Edge's context ids
2386   // over to the corresponding edge into the clone (which is created here if
2387   // this is a newly created clone).
2388   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2389     // The context ids moving to the new callee are the subset of this edge's
2390     // context ids and the context ids on the caller edge being moved.
2391     DenseSet<uint32_t> EdgeContextIdsToMove =
2392         set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2393     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2394     OldCalleeEdge->AllocTypes =
2395         computeAllocType(OldCalleeEdge->getContextIds());
2396     if (!NewClone) {
2397       // Update context ids / alloc type on corresponding edge to NewCallee.
2398       // There is a chance this may not exist if we are reusing an existing
2399       // clone, specifically during function assignment, where we would have
2400       // removed none type edges after creating the clone. If we can't find
2401       // a corresponding edge there, fall through to the cloning below.
2402       if (auto *NewCalleeEdge =
2403               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2404         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2405                                               EdgeContextIdsToMove.end());
2406         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2407         continue;
2408       }
2409     }
2410     auto NewEdge = std::make_shared<ContextEdge>(
2411         OldCalleeEdge->Callee, NewCallee,
2412         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2413     NewCallee->CalleeEdges.push_back(NewEdge);
2414     NewEdge->Callee->CallerEdges.push_back(NewEdge);
2415   }
2416   if (VerifyCCG) {
2417     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2418     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2419     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2420       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2421                                             /*CheckEdges=*/false);
2422     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2423       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2424                                             /*CheckEdges=*/false);
2425   }
2426 }
2427 
2428 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2429 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2430   DenseSet<const ContextNode *> Visited;
2431   for (auto &Entry : AllocationCallToContextNodeMap)
2432     identifyClones(Entry.second, Visited);
2433 }
2434 
2435 // Helper function to check whether an AllocType is cold, notcold, or both.
2436 bool checkColdOrNotCold(uint8_t AllocType) {
2437   return (AllocType == (uint8_t)AllocationType::Cold) ||
2438          (AllocType == (uint8_t)AllocationType::NotCold) ||
2439          (AllocType ==
2440           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2441 }
2442 
2443 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2444 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2445     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2446   if (VerifyNodes)
2447     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2448   assert(!Node->CloneOf);
2449 
2450   // If Node has a null call, then either it wasn't found in the module (regular
2451   // LTO) or summary index (ThinLTO), or there were other conditions blocking
2452   // cloning (e.g. recursion, a call to multiple targets, etc.).
2453   // Do this here so that we don't try to recursively clone callers below, which
2454   // isn't useful at least for this node.
2455   if (!Node->hasCall())
2456     return;
2457 
2458 #ifndef NDEBUG
2459   auto Insert =
2460 #endif
2461       Visited.insert(Node);
2462   // We should not have visited this node yet.
2463   assert(Insert.second);
2464   // The recursive call to identifyClones may delete the current edge from the
2465   // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2466   // in an iterator and having the recursive call erase from it. Other edges
2467   // may also get removed during the recursion; those will have null Callee and
2468   // Caller pointers (and are deleted later), so we skip them below.
2469   {
2470     auto CallerEdges = Node->CallerEdges;
2471     for (auto &Edge : CallerEdges) {
2472       // Skip any that have been removed by an earlier recursive call.
2473       if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2474         assert(!llvm::count(Node->CallerEdges, Edge));
2475         continue;
2476       }
2477       // Ignore any caller we previously visited via another edge.
2478       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2479         identifyClones(Edge->Caller, Visited);
2480       }
2481     }
2482   }
2483 
2484   // Check if we reached an unambiguous call or have only a single caller.
2485   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2486     return;
2487 
2488   // We need to clone.
2489 
2490   // Try to keep the original version as alloc type NotCold. This will make
2491   // cases with indirect calls or any other situation with an unknown call to
2492   // the original function get the default behavior. We do this by sorting the
2493   // CallerEdges of the Node we will clone by alloc type.
2494   //
2495   // Give NotCold edges the lowest sort priority so those edges are at the end of
2496   // the caller edges vector, and stay on the original version (since the below
2497   // code clones greedily until it finds all remaining edges have the same type
2498   // and leaves the remaining ones on the original Node).
2499   //
2500   // We shouldn't actually have any None type edges, so the sorting priority for
2501   // that is arbitrary, and we assert in that case below.
2502   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2503                                                /*Cold*/ 1,
2504                                                /*NotColdCold*/ 2};
2505   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2506                    [&](const std::shared_ptr<ContextEdge> &A,
2507                        const std::shared_ptr<ContextEdge> &B) {
2508                      assert(checkColdOrNotCold(A->AllocTypes) &&
2509                             checkColdOrNotCold(B->AllocTypes));
2510 
2511                      if (A->AllocTypes == B->AllocTypes)
2512                        // Use the first context id for each edge as a
2513                        // tie-breaker.
2514                        return *A->ContextIds.begin() < *B->ContextIds.begin();
2515                      return AllocTypeCloningPriority[A->AllocTypes] <
2516                             AllocTypeCloningPriority[B->AllocTypes];
2517                    });
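  // Illustrative note (hypothetical edge set, not taken from any real graph):
  // with the priorities above, caller edges whose AllocTypes are {NotCold,
  // Cold, NotCold|Cold} sort into the order {Cold, NotCold|Cold, NotCold}, so
  // the greedy cloning loop below peels off the Cold and mixed edges first and
  // the NotCold edges are the ones left behind on the original Node.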
2518 
2519   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2520 
2521   // Iterate until we find no more opportunities for disambiguating the alloc
2522   // types via cloning. In most cases this loop will terminate once the Node
2523   // has a single allocation type, in which case no more cloning is needed.
2524   // We need to be able to remove Edge from CallerEdges, so need to adjust
2525   // iterator inside the loop.
2526   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2527     auto CallerEdge = *EI;
2528 
2529     // See if cloning the prior caller edge left this node with a single alloc
2530     // type or a single caller. In that case no more cloning of Node is needed.
2531     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2532       break;
2533 
2534     // Compute the node callee edge alloc types corresponding to the context ids
2535     // for this caller edge.
2536     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2537     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2538     for (auto &CalleeEdge : Node->CalleeEdges)
2539       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2540           CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2541 
2542     // Don't clone if doing so will not disambiguate any alloc types amongst
2543     // caller edges (including the callee edges that would be cloned).
2544     // Otherwise we will simply move all edges to the clone.
2545     //
2546     // First check if by cloning we will disambiguate the caller allocation
2547     // type from node's allocation type. Query allocTypeToUse so that we don't
2548     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2549     // neither of these should be None type.
2550     //
2551     // Then check if by cloning node at least one of the callee edges will be
2552     // disambiguated by splitting out different context ids.
2553     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2554     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2555     if (allocTypeToUse(CallerEdge->AllocTypes) ==
2556             allocTypeToUse(Node->AllocTypes) &&
2557         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2558             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2559       ++EI;
2560       continue;
2561     }
2562 
2563     // First see if we can use an existing clone. Check each clone and its
2564     // callee edges for matching alloc types.
2565     ContextNode *Clone = nullptr;
2566     for (auto *CurClone : Node->Clones) {
2567       if (allocTypeToUse(CurClone->AllocTypes) !=
2568           allocTypeToUse(CallerEdge->AllocTypes))
2569         continue;
2570 
2571       if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2572               CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2573         continue;
2574       Clone = CurClone;
2575       break;
2576     }
2577 
2578     // The edge iterator is adjusted when we move the CallerEdge to the clone.
2579     if (Clone)
2580       moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2581     else
2582       Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2583 
2584     assert(EI == Node->CallerEdges.end() ||
2585            Node->AllocTypes != (uint8_t)AllocationType::None);
2586     // Sanity check that no alloc types on clone or its edges are None.
2587     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2588     assert(llvm::none_of(
2589         Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2590           return E->AllocTypes == (uint8_t)AllocationType::None;
2591         }));
2592   }
2593 
2594   // Cloning may have resulted in some cloned callee edges with type None,
2595   // because they aren't carrying any contexts. Remove those edges.
2596   for (auto *Clone : Node->Clones) {
2597     removeNoneTypeCalleeEdges(Clone);
2598     if (VerifyNodes)
2599       checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2600   }
2601   // We should still have some context ids on the original Node.
2602   assert(!Node->ContextIds.empty());
2603 
2604   // Remove any callee edges that ended up with alloc type None after creating
2605   // clones and updating callee edges.
2606   removeNoneTypeCalleeEdges(Node);
2607 
2608   // Sanity check that no alloc types on node or edges are None.
2609   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2610   assert(llvm::none_of(Node->CalleeEdges,
2611                        [&](const std::shared_ptr<ContextEdge> &E) {
2612                          return E->AllocTypes == (uint8_t)AllocationType::None;
2613                        }));
2614   assert(llvm::none_of(Node->CallerEdges,
2615                        [&](const std::shared_ptr<ContextEdge> &E) {
2616                          return E->AllocTypes == (uint8_t)AllocationType::None;
2617                        }));
2618 
2619   if (VerifyNodes)
2620     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2621 }
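// Illustrative example (hypothetical graph, for exposition only): if an
// allocation node A has a caller edge from B carrying only cold context ids
// and a caller edge from C carrying only not-cold context ids, identifyClones
// moves the cold edge onto a new clone A' and leaves the not-cold contexts on
// A. The later assignFunctions step then decides which function clones A and
// A' are placed in and rewrites the calls accordingly.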
2622 
2623 void ModuleCallsiteContextGraph::updateAllocationCall(
2624     CallInfo &Call, AllocationType AllocType) {
2625   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2626   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2627                                 "memprof", AllocTypeString);
2628   cast<CallBase>(Call.call())->addFnAttr(A);
2629   OREGetter(Call.call()->getFunction())
2630       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2631             << ore::NV("AllocationCall", Call.call()) << " in clone "
2632             << ore::NV("Caller", Call.call()->getFunction())
2633             << " marked with memprof allocation attribute "
2634             << ore::NV("Attribute", AllocTypeString));
2635 }
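// For illustration (approximate IR, hypothetical names): after this update a
// cold allocation call carries a "memprof" string attribute, roughly
//   %call = call ptr @_Znwm(i64 8) #1
//   ...
//   attributes #1 = { "memprof"="cold" }
// which a later transformation can use to route the allocation to a cold path.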
2636 
2637 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2638                                                      AllocationType AllocType) {
2639   auto *AI = Call.call().dyn_cast<AllocInfo *>();
2640   assert(AI);
2641   assert(AI->Versions.size() > Call.cloneNo());
2642   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2643 }
2644 
2645 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2646                                             FuncInfo CalleeFunc) {
2647   if (CalleeFunc.cloneNo() > 0)
2648     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2649   OREGetter(CallerCall.call()->getFunction())
2650       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2651             << ore::NV("Call", CallerCall.call()) << " in clone "
2652             << ore::NV("Caller", CallerCall.call()->getFunction())
2653             << " assigned to call function clone "
2654             << ore::NV("Callee", CalleeFunc.func()));
2655 }
2656 
2657 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2658                                            FuncInfo CalleeFunc) {
2659   auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2660   assert(CI &&
2661          "Caller cannot be an allocation which should not have profiled calls");
2662   assert(CI->Clones.size() > CallerCall.cloneNo());
2663   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2664 }
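// For illustration (hypothetical values): if callsite clone 1 of this caller
// should invoke function clone 2 of its callee, the assignment above records
// CI->Clones[1] = 2 in the caller's CallsiteInfo summary; the ThinLTO backend
// later reads this array in applyImport when rewriting the cloned call.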
2665 
2666 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2667                      Instruction *>::FuncInfo
2668 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2669     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2670     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2671   // Use existing LLVM facilities for cloning and obtaining Call in clone
2672   ValueToValueMapTy VMap;
2673   auto *NewFunc = CloneFunction(Func.func(), VMap);
2674   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2675   assert(!Func.func()->getParent()->getFunction(Name));
2676   NewFunc->setName(Name);
2677   for (auto &Inst : CallsWithMetadataInFunc) {
2678     // This map always has the initial version in it.
2679     assert(Inst.cloneNo() == 0);
2680     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2681   }
2682   OREGetter(Func.func())
2683       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2684             << "created clone " << ore::NV("NewFunction", NewFunc));
2685   return {NewFunc, CloneNo};
2686 }
2687 
2688 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2689                      IndexCall>::FuncInfo
2690 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2691     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2692     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2693   // Check how many clones we have of Call (and therefore function).
2694   // The next clone number is the current size of versions array.
2695   // Confirm this matches the CloneNo provided by the caller, which is based on
2696   // the number of function clones we have.
2697   assert(CloneNo ==
2698          (Call.call().is<AllocInfo *>()
2699               ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2700               : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2701   // Walk all the calls with memprof metadata in this function. Create a new
2702   // version for each (by adding an entry to the Versions/Clones summary
2703   // array), and copy over the version being called for the function clone
2704   // being cloned here. Additionally, add an entry to the CallMap for the new
2705   // function clone, mapping the original call (clone 0, what is in
2706   // CallsWithMetadataInFunc) to the new call clone.
2707   for (auto &Inst : CallsWithMetadataInFunc) {
2708     // This map always has the initial version in it.
2709     assert(Inst.cloneNo() == 0);
2710     if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2711       assert(AI->Versions.size() == CloneNo);
2712       // We assign the allocation type later (in updateAllocationCall); just add
2713       // an entry for it here.
2714       AI->Versions.push_back(0);
2715     } else {
2716       auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2717       assert(CI && CI->Clones.size() == CloneNo);
2718       // We assign the clone number later (in updateCall); just add an entry for
2719       // it here.
2720       CI->Clones.push_back(0);
2721     }
2722     CallMap[Inst] = {Inst.call(), CloneNo};
2723   }
2724   return {Func.func(), CloneNo};
2725 }
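// For illustration (hypothetical summary state): an AllocInfo that currently
// has a single Versions entry gets a placeholder value of 0 (None) appended
// when function clone 1 is created here, so Versions grows to two slots; the
// actual allocation type for each slot is assigned later in
// updateAllocationCall. CallsiteInfo::Clones grows the same way, with the
// callee function clone number filled in later by updateCall.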
2726 
2727 // This method assigns cloned callsites to functions, cloning the functions as
2728 // needed. The assignment is greedy and proceeds roughly as follows:
2729 //
2730 // For each function Func:
2731 //   For each call with graph Node having clones:
2732 //     Initialize ClonesWorklist to Node and its clones
2733 //     Initialize NodeCloneCount to 0
2734 //     While ClonesWorklist is not empty:
2735 //        Clone = pop front ClonesWorklist
2736 //        NodeCloneCount++
2737 //        If Func has been cloned less than NodeCloneCount times:
2738 //           If NodeCloneCount is 1:
2739 //             Assign Clone to original Func
2740 //             Continue
2741 //           Create a new function clone
2742 //           If other callers are not yet assigned to call a function clone:
2743 //              Assign them to call the new function clone
2744 //              Continue
2745 //           Assign any other caller calling the cloned version to the new clone
2746 //
2747 //        For each caller of Clone:
2748 //           If caller is assigned to call a specific function clone:
2749 //             If we cannot assign Clone to that function clone:
2750 //               Create new callsite Clone NewClone
2751 //               Add NewClone to ClonesWorklist
2752 //               Continue
2753 //             Assign Clone to existing caller's called function clone
2754 //           Else:
2755 //             If Clone not already assigned to a function clone:
2756 //                Assign to first function clone without assignment
2757 //             Assign caller to selected function clone
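// For illustration (hypothetical module, names invented): suppose function A
// contains a callsite whose graph node was cloned once during identifyClones
// (Node plus Clone1), with caller B reaching Node and caller C reaching Clone1.
// Node is assigned to the original A; Clone1 then forces creation of a function
// clone of A (named via getMemProfFuncName, e.g. "A.memprof.1"), C is recorded
// in CallsiteToCalleeFuncCloneMap as calling that clone, and B keeps calling
// the original A.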
2758 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2759 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2760   bool Changed = false;
2761 
2762   // Keep track of the assignment of nodes (callsites) to function clones they
2763   // call.
2764   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2765 
2766   // Update caller node to call function version CalleeFunc, by recording the
2767   // assignment in CallsiteToCalleeFuncCloneMap.
2768   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2769                                         const FuncInfo &CalleeFunc) {
2770     assert(Caller->hasCall());
2771     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2772   };
2773 
2774   // Walk all functions for which we saw calls with memprof metadata, and handle
2775   // cloning for each of their calls.
2776   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2777     FuncInfo OrigFunc(Func);
2778     // Map from each clone of OrigFunc to a map of remappings of each call of
2779     // interest (from original uncloned call to the corresponding cloned call in
2780     // that function clone).
2781     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2782     for (auto &Call : CallsWithMetadata) {
2783       ContextNode *Node = getNodeForInst(Call);
2784       // Skip call if we do not have a node for it (all uses of its stack ids
2785       // were either on inlined chains or pruned from the MIBs), or if we did
2786       // not create any clones for it.
2787       if (!Node || Node->Clones.empty())
2788         continue;
2789       assert(Node->hasCall() &&
2790              "Not having a call should have prevented cloning");
2791 
2792       // Track the assignment of function clones to clones of the current
2793       // callsite Node being handled.
2794       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2795 
2796       // Assign callsite version CallsiteClone to function version FuncClone,
2797       // and also assign (possibly cloned) Call to CallsiteClone.
2798       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2799                                                 CallInfo &Call,
2800                                                 ContextNode *CallsiteClone,
2801                                                 bool IsAlloc) {
2802         // Record the clone of callsite node assigned to this function clone.
2803         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2804 
2805         assert(FuncClonesToCallMap.count(FuncClone));
2806         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2807         CallInfo CallClone(Call);
2808         if (CallMap.count(Call))
2809           CallClone = CallMap[Call];
2810         CallsiteClone->setCall(CallClone);
2811       };
2812 
2813       // Keep track of the clones of callsite Node that need to be assigned to
2814       // function clones. This list may be expanded in the loop body below if we
2815       // find additional cloning is required.
2816       std::deque<ContextNode *> ClonesWorklist;
2817       // Ignore original Node if we moved all of its contexts to clones.
2818       if (!Node->ContextIds.empty())
2819         ClonesWorklist.push_back(Node);
2820       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2821                             Node->Clones.end());
2822 
2823       // Now walk through all of the clones of this callsite Node that we need,
2824       // and determine the assignment to a corresponding clone of the current
2825       // function (creating new function clones as needed).
2826       unsigned NodeCloneCount = 0;
2827       while (!ClonesWorklist.empty()) {
2828         ContextNode *Clone = ClonesWorklist.front();
2829         ClonesWorklist.pop_front();
2830         NodeCloneCount++;
2831         if (VerifyNodes)
2832           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2833 
2834         // Need to create a new function clone if we have more callsite clones
2835         // than existing function clones, since the existing function clones would
2836         // already have been assigned to earlier callsite clones in the list (we
2837         // assign callsite clones to function clones greedily).
2838         if (FuncClonesToCallMap.size() < NodeCloneCount) {
2839           // If this is the first callsite copy, assign to original function.
2840           if (NodeCloneCount == 1) {
2841             // Since FuncClonesToCallMap is empty in this case, no clones have
2842             // been created for this function yet, and no callers should have
2843             // been assigned a function clone for this callee node yet.
2844             assert(llvm::none_of(
2845                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2846                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2847                 }));
2848             // Initialize with empty call map, assign Clone to original function
2849             // and its callers, and skip to the next clone.
2850             FuncClonesToCallMap[OrigFunc] = {};
2851             AssignCallsiteCloneToFuncClone(
2852                 OrigFunc, Call, Clone,
2853                 AllocationCallToContextNodeMap.count(Call));
2854             for (auto &CE : Clone->CallerEdges) {
2855               // Ignore any caller that does not have a recorded callsite Call.
2856               if (!CE->Caller->hasCall())
2857                 continue;
2858               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2859             }
2860             continue;
2861           }
2862 
2863           // First locate which copy of OrigFunc to clone again. If a caller
2864           // of this callsite clone was already assigned to call a particular
2865           // function clone, we need to redirect all of those callers to the
2866           // new function clone, and update their other callees within this
2867           // function.
2868           FuncInfo PreviousAssignedFuncClone;
2869           auto EI = llvm::find_if(
2870               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2871                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2872               });
2873           bool CallerAssignedToCloneOfFunc = false;
2874           if (EI != Clone->CallerEdges.end()) {
2875             const std::shared_ptr<ContextEdge> &Edge = *EI;
2876             PreviousAssignedFuncClone =
2877                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2878             CallerAssignedToCloneOfFunc = true;
2879           }
2880 
2881           // Clone function and save it along with the CallInfo map created
2882           // during cloning in the FuncClonesToCallMap.
2883           std::map<CallInfo, CallInfo> NewCallMap;
2884           unsigned CloneNo = FuncClonesToCallMap.size();
2885           assert(CloneNo > 0 && "Clone 0 is the original function, which "
2886                                 "should already exist in the map");
2887           FuncInfo NewFuncClone = cloneFunctionForCallsite(
2888               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2889           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2890           FunctionClonesAnalysis++;
2891           Changed = true;
2892 
2893           // If no caller callsites were already assigned to a clone of this
2894           // function, we can simply assign this clone to the new func clone
2895           // and update all callers to it, then skip to the next clone.
2896           if (!CallerAssignedToCloneOfFunc) {
2897             AssignCallsiteCloneToFuncClone(
2898                 NewFuncClone, Call, Clone,
2899                 AllocationCallToContextNodeMap.count(Call));
2900             for (auto &CE : Clone->CallerEdges) {
2901               // Ignore any caller that does not have a recorded callsite Call.
2902               if (!CE->Caller->hasCall())
2903                 continue;
2904               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2905             }
2906             continue;
2907           }
2908 
2909           // We may need to do additional node cloning in this case.
2910           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2911           // that were previously assigned to call PreviousAssignedFuncClone,
2912           // to record that they now call NewFuncClone.
2913           for (auto CE : Clone->CallerEdges) {
2914             // Skip any that have been removed on an earlier iteration.
2915             if (!CE)
2916               continue;
2917             // Ignore any caller that does not have a recorded callsite Call.
2918             if (!CE->Caller->hasCall())
2919               continue;
2920 
2921             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2922                 // We subsequently fall through to later handling that
2923                 // will perform any additional cloning required for
2924                 // callers that were calling other function clones.
2925                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2926                     PreviousAssignedFuncClone)
2927               continue;
2928 
2929             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2930 
2931             // If we are cloning a function that was already assigned to some
2932             // callers, then essentially we are creating new callsite clones
2933             // of the other callsites in that function that are reached by those
2934             // callers. Clone the other callees of the current callsite's caller
2935             // that were already assigned to PreviousAssignedFuncClone
2936             // accordingly. This is important since we subsequently update the
2937             // calls from the nodes in the graph and their assignments to callee
2938             // functions recorded in CallsiteToCalleeFuncCloneMap.
2939             for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2940               // Skip any that have been removed on an earlier iteration when
2941               // cleaning up newly None type callee edges.
2942               if (!CalleeEdge)
2943                 continue;
2944               ContextNode *Callee = CalleeEdge->Callee;
2945               // Skip the current callsite, since we are looking for other
2946               // callsites that Caller calls, and also skip any callee that does
2947               // not have a recorded callsite Call.
2948               if (Callee == Clone || !Callee->hasCall())
2949                 continue;
2950               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2951               removeNoneTypeCalleeEdges(NewClone);
2952               // Moving the edge may have resulted in some none type
2953               // callee edges on the original Callee.
2954               removeNoneTypeCalleeEdges(Callee);
2955               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2956               // If the Callee node was already assigned to call a specific
2957               // function version, make sure its new clone is assigned to call
2958               // that same function clone.
2959               if (CallsiteToCalleeFuncCloneMap.count(Callee))
2960                 RecordCalleeFuncOfCallsite(
2961                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2962               // Update NewClone with the new Call clone of this callsite's Call
2963               // created for the new function clone created earlier.
2964               // Recall that we have already ensured when building the graph
2965               // that each caller can only call callsites within the same
2966               // function, so we are guaranteed that Callee Call is in the
2967               // current OrigFunc.
2968               // CallMap is set up as indexed by original Call at clone 0.
2969               CallInfo OrigCall(Callee->getOrigNode()->Call);
2970               OrigCall.setCloneNo(0);
2971               std::map<CallInfo, CallInfo> &CallMap =
2972                   FuncClonesToCallMap[NewFuncClone];
2973               assert(CallMap.count(OrigCall));
2974               CallInfo NewCall(CallMap[OrigCall]);
2975               assert(NewCall);
2976               NewClone->setCall(NewCall);
2977             }
2978           }
2979           // Fall through to handling below to perform the recording of the
2980           // function for this callsite clone. This enables handling of cases
2981           // where the callers were assigned to different clones of a function.
2982         }
2983 
2984         // See if we can use an existing function clone. Walk through
2985         // all caller edges to see if any have already been assigned to
2986         // a clone of this callsite's function. If we can use it, do so. If not,
2987         // because that function clone is already assigned to a different clone
2988         // of this callsite, then we need to clone again.
2989         // Basically, this checking is needed to handle the case where different
2990         // caller functions/callsites may need versions of this function
2991         // containing different mixes of callsite clones across the different
2992         // callsites within the function. If that happens, we need to create
2993         // additional function clones to handle the various combinations.
2994         //
2995         // Keep track of any new clones of this callsite created by the
2996         // following loop, as well as any existing clone that we decided to
2997         // assign this clone to.
2998         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
2999         FuncInfo FuncCloneAssignedToCurCallsiteClone;
3000         // We need to be able to remove Edge from CallerEdges, so need to adjust
3001         // iterator in the loop.
3002         for (auto EI = Clone->CallerEdges.begin();
3003              EI != Clone->CallerEdges.end();) {
3004           auto Edge = *EI;
3005           // Ignore any caller that does not have a recorded callsite Call.
3006           if (!Edge->Caller->hasCall()) {
3007             EI++;
3008             continue;
3009           }
3010           // If this caller is already assigned to call a version of OrigFunc, we
3011           // need to ensure we can assign this callsite clone to that function clone.
3012           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3013             FuncInfo FuncCloneCalledByCaller =
3014                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3015             // First we need to confirm that this function clone is available
3016             // for use by this callsite node clone.
3017             //
3018             // While FuncCloneToCurNodeCloneMap is built only for this Node and
3019             // its callsite clones, one of those callsite clones X could have
3020             // been assigned to the same function clone called by Edge's caller
3021             // - if Edge's caller calls another callsite within Node's original
3022             // function, and that callsite has another caller reaching clone X.
3023             // We need to clone Node again in this case.
3024             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3025                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3026                      Clone) ||
3027                 // Detect when we have multiple callers of this callsite that
3028                 // have already been assigned to specific, and different, clones
3029                 // of OrigFunc (due to other unrelated callsites in Func they
3030                 // reach via call contexts). Is this Clone of callsite Node
3031                 // assigned to a different clone of OrigFunc? If so, clone Node
3032                 // again.
3033                 (FuncCloneAssignedToCurCallsiteClone &&
3034                  FuncCloneAssignedToCurCallsiteClone !=
3035                      FuncCloneCalledByCaller)) {
3036               // We need to use a different newly created callsite clone, in
3037               // order to assign it to another new function clone on a
3038               // subsequent iteration over the Clones array (adjusted below).
3039               // Note we specifically do not reset the
3040               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3041               // when this new clone is processed later we know which version of
3042               // the function to copy (so that other callsite clones we have
3043               // assigned to that function clone are properly cloned over). See
3044               // comments in the function cloning handling earlier.
3045 
3046               // Check if we already have cloned this callsite again while
3047               // walking through caller edges, for a caller calling the same
3048               // function clone. If so, we can move this edge to that new clone
3049               // rather than creating yet another new clone.
3050               if (FuncCloneToNewCallsiteCloneMap.count(
3051                       FuncCloneCalledByCaller)) {
3052                 ContextNode *NewClone =
3053                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3054                 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3055                 // Cleanup any none type edges cloned over.
3056                 removeNoneTypeCalleeEdges(NewClone);
3057               } else {
3058                 // Create a new callsite clone.
3059                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3060                 removeNoneTypeCalleeEdges(NewClone);
3061                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3062                     NewClone;
3063                 // Add to list of clones and process later.
3064                 ClonesWorklist.push_back(NewClone);
3065                 assert(EI == Clone->CallerEdges.end() ||
3066                        Clone->AllocTypes != (uint8_t)AllocationType::None);
3067                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3068               }
3069               // Moving the caller edge may have resulted in some none type
3070               // callee edges.
3071               removeNoneTypeCalleeEdges(Clone);
3072               // We will handle the newly created callsite clone in a subsequent
3073               // iteration over this Node's Clones. Continue here since we
3074               // already adjusted iterator EI while moving the edge.
3075               continue;
3076             }
3077 
3078             // Otherwise, we can use the function clone already assigned to this
3079             // caller.
3080             if (!FuncCloneAssignedToCurCallsiteClone) {
3081               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3082               // Assign Clone to FuncCloneCalledByCaller
3083               AssignCallsiteCloneToFuncClone(
3084                   FuncCloneCalledByCaller, Call, Clone,
3085                   AllocationCallToContextNodeMap.count(Call));
3086             } else
3087               // Don't need to do anything - callsite is already calling this
3088               // function clone.
3089               assert(FuncCloneAssignedToCurCallsiteClone ==
3090                      FuncCloneCalledByCaller);
3091 
3092           } else {
3093             // We have not already assigned this caller to a version of
3094             // OrigFunc. Do the assignment now.
3095 
3096             // First check if we have already assigned this callsite clone to a
3097             // clone of OrigFunc for another caller during this iteration over
3098             // its caller edges.
3099             if (!FuncCloneAssignedToCurCallsiteClone) {
3100               // Find first function in FuncClonesToCallMap without an assigned
3101               // clone of this callsite Node. We should always have one
3102               // available at this point due to the earlier cloning when the
3103               // FuncClonesToCallMap size was smaller than the clone number.
3104               for (auto &CF : FuncClonesToCallMap) {
3105                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3106                   FuncCloneAssignedToCurCallsiteClone = CF.first;
3107                   break;
3108                 }
3109               }
3110               assert(FuncCloneAssignedToCurCallsiteClone);
3111               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3112               AssignCallsiteCloneToFuncClone(
3113                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3114                   AllocationCallToContextNodeMap.count(Call));
3115             } else
3116               assert(FuncCloneToCurNodeCloneMap
3117                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
3118             // Update callers to record function version called.
3119             RecordCalleeFuncOfCallsite(Edge->Caller,
3120                                        FuncCloneAssignedToCurCallsiteClone);
3121           }
3122 
3123           EI++;
3124         }
3125       }
3126       if (VerifyCCG) {
3127         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3128         for (const auto &PE : Node->CalleeEdges)
3129           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3130         for (const auto &CE : Node->CallerEdges)
3131           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3132         for (auto *Clone : Node->Clones) {
3133           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3134           for (const auto &PE : Clone->CalleeEdges)
3135             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3136           for (const auto &CE : Clone->CallerEdges)
3137             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3138         }
3139       }
3140     }
3141   }
3142 
3143   auto UpdateCalls = [&](ContextNode *Node,
3144                          DenseSet<const ContextNode *> &Visited,
3145                          auto &&UpdateCalls) {
3146     auto Inserted = Visited.insert(Node);
3147     if (!Inserted.second)
3148       return;
3149 
3150     for (auto *Clone : Node->Clones)
3151       UpdateCalls(Clone, Visited, UpdateCalls);
3152 
3153     for (auto &Edge : Node->CallerEdges)
3154       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3155 
3156     // Skip if either no call to update, or if we ended up with no context ids
3157     // (we moved all edges onto other clones).
3158     if (!Node->hasCall() || Node->ContextIds.empty())
3159       return;
3160 
3161     if (Node->IsAllocation) {
3162       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3163       return;
3164     }
3165 
3166     if (!CallsiteToCalleeFuncCloneMap.count(Node))
3167       return;
3168 
3169     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3170     updateCall(Node->Call, CalleeFunc);
3171   };
3172 
3173   // Performs DFS traversal starting from allocation nodes to update calls to
3174   // reflect cloning decisions recorded earlier. For regular LTO this will
3175   // update the actual calls in the IR to call the appropriate function clone
3176   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3177   // are recorded in the summary entries.
3178   DenseSet<const ContextNode *> Visited;
3179   for (auto &Entry : AllocationCallToContextNodeMap)
3180     UpdateCalls(Entry.second, Visited, UpdateCalls);
3181 
3182   return Changed;
3183 }
3184 
3185 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3186     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3187     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3188         &FuncToAliasMap) {
3189   // The first "clone" is the original copy; we should only call this if we
3190   // needed to create new clones.
3191   assert(NumClones > 1);
3192   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3193   VMaps.reserve(NumClones - 1);
3194   FunctionsClonedThinBackend++;
3195   for (unsigned I = 1; I < NumClones; I++) {
3196     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3197     auto *NewF = CloneFunction(&F, *VMaps.back());
3198     FunctionClonesThinBackend++;
3199     // Strip memprof and callsite metadata from clone as they are no longer
3200     // needed.
3201     for (auto &BB : *NewF) {
3202       for (auto &Inst : BB) {
3203         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3204         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3205       }
3206     }
3207     std::string Name = getMemProfFuncName(F.getName(), I);
3208     auto *PrevF = M.getFunction(Name);
3209     if (PrevF) {
3210       // We might have created this when adjusting a callsite in another
3211       // function. It should be a declaration.
3212       assert(PrevF->isDeclaration());
3213       NewF->takeName(PrevF);
3214       PrevF->replaceAllUsesWith(NewF);
3215       PrevF->eraseFromParent();
3216     } else
3217       NewF->setName(Name);
3218     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3219              << "created clone " << ore::NV("NewFunction", NewF));
3220 
3221     // Now handle aliases to this function, and clone those as well.
3222     if (!FuncToAliasMap.count(&F))
3223       continue;
3224     for (auto *A : FuncToAliasMap[&F]) {
3225       std::string Name = getMemProfFuncName(A->getName(), I);
3226       auto *PrevA = M.getNamedAlias(Name);
3227       auto *NewA = GlobalAlias::create(A->getValueType(),
3228                                        A->getType()->getPointerAddressSpace(),
3229                                        A->getLinkage(), Name, NewF);
3230       NewA->copyAttributesFrom(A);
3231       if (PrevA) {
3232         // We might have created this when adjusting a callsite in another
3233         // function. It should be a declaration.
3234         assert(PrevA->isDeclaration());
3235         NewA->takeName(PrevA);
3236         PrevA->replaceAllUsesWith(NewA);
3237         PrevA->eraseFromParent();
3238       }
3239     }
3240   }
3241   return VMaps;
3242 }
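// For illustration (hypothetical names): a call to createFunctionClones with
// F = foo and NumClones = 3 creates two clones named via getMemProfFuncName
// (e.g. "foo.memprof.1" and "foo.memprof.2") and returns two VMaps; callers
// index them as VMaps[J - 1] for clone J, since clone 0 is the original foo
// and has no map.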
3243 
3244 // Locate the summary for F. This is complicated by the fact that it might
3245 // have been internalized or promoted.
3246 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3247                                       const ModuleSummaryIndex *ImportSummary) {
3248   // FIXME: Ideally we would retain the original GUID in some fashion on the
3249   // function (e.g. as metadata), but for now do our best to locate the
3250   // summary without that information.
3251   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3252   if (!TheFnVI)
3253     // See if theFn was internalized, by checking index directly with
3254     // original name (this avoids the name adjustment done by getGUID() for
3255     // internal symbols).
3256     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3257   if (TheFnVI)
3258     return TheFnVI;
3259   // Now query with the original name before any promotion was performed.
3260   StringRef OrigName =
3261       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3262   std::string OrigId = GlobalValue::getGlobalIdentifier(
3263       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3264   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3265   if (TheFnVI)
3266     return TheFnVI;
3267   // Could be a promoted local imported from another module. We need to pass
3268   // down more info here to find the original module id. For now, try with
3269   // the OrigName which might have been stored in the OidGuidMap in the
3270   // index. This would not work if there were same-named locals in multiple
3271   // modules, however.
3272   auto OrigGUID =
3273       ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3274   if (OrigGUID)
3275     TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3276   return TheFnVI;
3277 }
3278 
3279 bool MemProfContextDisambiguation::applyImport(Module &M) {
3280   assert(ImportSummary);
3281   bool Changed = false;
3282 
3283   auto IsMemProfClone = [](const Function &F) {
3284     return F.getName().contains(MemProfCloneSuffix);
3285   };
3286 
3287   // We also need to clone any aliases that reference cloned functions, because
3288   // the modified callsites may invoke via the alias. Keep track of the aliases
3289   // for each function.
3290   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3291       FuncToAliasMap;
3292   for (auto &A : M.aliases()) {
3293     auto *Aliasee = A.getAliaseeObject();
3294     if (auto *F = dyn_cast<Function>(Aliasee))
3295       FuncToAliasMap[F].insert(&A);
3296   }
3297 
3298   for (auto &F : M) {
3299     if (F.isDeclaration() || IsMemProfClone(F))
3300       continue;
3301 
3302     OptimizationRemarkEmitter ORE(&F);
3303 
3304     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3305     bool ClonesCreated = false;
3306     unsigned NumClonesCreated = 0;
3307     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3308       // We should at least have version 0 which is the original copy.
3309       assert(NumClones > 0);
3310       // If only one copy needed use original.
3311       if (NumClones == 1)
3312         return;
3313       // If we already performed cloning of this function, confirm that the
3314       // requested number of clones matches (the thin link should ensure the
3315       // number of clones for each constituent callsite is consistent within
3316       // each function), before returning.
3317       if (ClonesCreated) {
3318         assert(NumClonesCreated == NumClones);
3319         return;
3320       }
3321       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3322       // The first "clone" is the original copy, which doesn't have a VMap.
3323       assert(VMaps.size() == NumClones - 1);
3324       Changed = true;
3325       ClonesCreated = true;
3326       NumClonesCreated = NumClones;
3327     };
3328 
3329     auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3330                              Function *CalledFunction) {
3331       // Perform cloning if not yet done.
3332       CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3333 
3334       // Should have skipped indirect calls via mayHaveMemprofSummary.
3335       assert(CalledFunction);
3336       assert(!IsMemProfClone(*CalledFunction));
3337 
3338       // Update the calls per the summary info.
3339       // Save orig name since it gets updated in the first iteration
3340       // below.
3341       auto CalleeOrigName = CalledFunction->getName();
3342       for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3343         // Do nothing if this version calls the original version of its
3344         // callee.
3345         if (!StackNode.Clones[J])
3346           continue;
3347         auto NewF = M.getOrInsertFunction(
3348             getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3349             CalledFunction->getFunctionType());
3350         CallBase *CBClone;
3351         // Copy 0 is the original function.
3352         if (!J)
3353           CBClone = CB;
3354         else
3355           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3356         CBClone->setCalledFunction(NewF);
3357         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3358                  << ore::NV("Call", CBClone) << " in clone "
3359                  << ore::NV("Caller", CBClone->getFunction())
3360                  << " assigned to call function clone "
3361                  << ore::NV("Callee", NewF.getCallee()));
3362       }
3363     };
3364 
3365     // Locate the summary for F.
3366     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3367     // If not found, this could be an imported local (see comment in
3368     // findValueInfoForFunc). Skip for now as it will be cloned in its original
3369     // module (where it would have been promoted to global scope so should
3370     // satisfy any reference in this module).
3371     if (!TheFnVI)
3372       continue;
3373 
3374     auto *GVSummary =
3375         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3376     if (!GVSummary)
3377       // Must have been imported; use the first summary (might be multiple if
3378       // this was a linkonce_odr).
3379       GVSummary = TheFnVI.getSummaryList().front().get();
3380 
3381     // If this was an imported alias skip it as we won't have the function
3382     // summary, and it should be cloned in the original module.
3383     if (isa<AliasSummary>(GVSummary))
3384       continue;
3385 
3386     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3387 
3388     if (FS->allocs().empty() && FS->callsites().empty())
3389       continue;
3390 
3391     auto SI = FS->callsites().begin();
3392     auto AI = FS->allocs().begin();
3393 
3394     // To handle callsite infos synthesized for tail calls which have missing
3395     // frames in the profiled context, map callee VI to the synthesized callsite
3396     // info.
3397     DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3398     // Iterate the callsites for this function in reverse, since we place all
3399     // those synthesized for tail calls at the end.
3400     for (auto CallsiteIt = FS->callsites().rbegin();
3401          CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3402       auto &Callsite = *CallsiteIt;
3403       // Stop as soon as we see a non-synthesized callsite info (see comment
3404       // above loop). All the entries added for discovered tail calls have empty
3405       // stack ids.
3406       if (!Callsite.StackIdIndices.empty())
3407         break;
3408       MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3409     }
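    // For illustration (hypothetical functions): a CallsiteInfo synthesized for
    // a discovered tail call has empty StackIdIndices and its Callee set to the
    // tail-called function's ValueInfo (e.g. bar for a tail call foo -> bar);
    // the map above lets the tail-call handling later in this loop look that
    // entry up by callee VI when the corresponding call instruction is visited.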
3410 
3411     // Assume for now that the instructions are in the exact same order
3412     // as when the summary was created, but confirm this is correct by
3413     // matching the stack ids.
3414     for (auto &BB : F) {
3415       for (auto &I : BB) {
3416         auto *CB = dyn_cast<CallBase>(&I);
3417         // Same handling as when creating module summary.
3418         if (!mayHaveMemprofSummary(CB))
3419           continue;
3420 
3421         auto *CalledValue = CB->getCalledOperand();
3422         auto *CalledFunction = CB->getCalledFunction();
3423         if (CalledValue && !CalledFunction) {
3424           CalledValue = CalledValue->stripPointerCasts();
3425           // Stripping pointer casts can reveal a called function.
3426           CalledFunction = dyn_cast<Function>(CalledValue);
3427         }
3428         // Check if this is an alias to a function. If so, get the
3429         // called aliasee for the checks below.
3430         if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3431           assert(!CalledFunction &&
3432                  "Expected null called function in callsite for alias");
3433           CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3434         }
3435 
3436         CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3437             I.getMetadata(LLVMContext::MD_callsite));
3438         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3439 
3440         // Include allocs that were already assigned a memprof function
3441         // attribute in the statistics.
3442         if (CB->getAttributes().hasFnAttr("memprof")) {
3443           assert(!MemProfMD);
3444           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3445               ? AllocTypeColdThinBackend++
3446               : AllocTypeNotColdThinBackend++;
3447           OrigAllocsThinBackend++;
3448           AllocVersionsThinBackend++;
3449           if (!MaxAllocVersionsThinBackend)
3450             MaxAllocVersionsThinBackend = 1;
3451           // Remove any remaining callsite metadata; we can skip the rest of the
3452           // handling for this instruction, since no cloning is needed.
3453           I.setMetadata(LLVMContext::MD_callsite, nullptr);
3454           continue;
3455         }
3456 
3457         if (MemProfMD) {
3458           // Consult the next alloc node.
3459           assert(AI != FS->allocs().end());
3460           auto &AllocNode = *(AI++);
3461 
3462           // Sanity check that the MIB stack ids match between the summary and
3463           // instruction metadata.
3464           auto MIBIter = AllocNode.MIBs.begin();
3465           for (auto &MDOp : MemProfMD->operands()) {
3466             assert(MIBIter != AllocNode.MIBs.end());
3467             LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3468                 MIBIter->StackIdIndices.begin();
3469             auto *MIBMD = cast<const MDNode>(MDOp);
3470             MDNode *StackMDNode = getMIBStackNode(MIBMD);
3471             assert(StackMDNode);
3472             SmallVector<unsigned> StackIdsFromMetadata;
3473             CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3474             for (auto ContextIter =
3475                      StackContext.beginAfterSharedPrefix(CallsiteContext);
3476                  ContextIter != StackContext.end(); ++ContextIter) {
3477               // If this is a direct recursion, simply skip the duplicate
3478               // entries, to be consistent with how the summary ids were
3479               // generated during ModuleSummaryAnalysis.
3480               if (!StackIdsFromMetadata.empty() &&
3481                   StackIdsFromMetadata.back() == *ContextIter)
3482                 continue;
3483               assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3484               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3485                      *ContextIter);
3486               StackIdIndexIter++;
3487             }
3488             MIBIter++;
3489           }
3490 
3491           // Perform cloning if not yet done.
3492           CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3493 
3494           OrigAllocsThinBackend++;
3495           AllocVersionsThinBackend += AllocNode.Versions.size();
3496           if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3497             MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3498 
3499           // If there is only one version, that means we didn't end up
3500           // considering this function for cloning, and in that case the alloc
3501           // will still be None type or should have gotten the default NotCold.
3502           // Skip it after calling the clone helper, since the helper's sanity
3503           // checks confirm we haven't yet decided that cloning is needed.
3504           if (AllocNode.Versions.size() == 1) {
3505             assert((AllocationType)AllocNode.Versions[0] ==
3506                        AllocationType::NotCold ||
3507                    (AllocationType)AllocNode.Versions[0] ==
3508                        AllocationType::None);
3509             UnclonableAllocsThinBackend++;
3510             continue;
3511           }
3512 
3513           // All versions should have a singular allocation type.
3514           assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3515             return Type == ((uint8_t)AllocationType::NotCold |
3516                             (uint8_t)AllocationType::Cold);
3517           }));
3518 
3519           // Update the allocation types per the summary info.
3520           for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3521             // Ignore any that didn't get an assigned allocation type.
3522             if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3523               continue;
3524             AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3525             AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3526                                             : AllocTypeNotColdThinBackend++;
3527             std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3528             auto A = llvm::Attribute::get(F.getContext(), "memprof",
3529                                           AllocTypeString);
3530             CallBase *CBClone;
3531             // Copy 0 is the original function.
3532             if (!J)
3533               CBClone = CB;
3534             else
3535               // Since VMaps are only created for new clones, we index with
3536               // clone J-1 (J==0 is the original clone and does not have a VMaps
3537               // entry).
3538               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3539             CBClone->addFnAttr(A);
3540             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3541                      << ore::NV("AllocationCall", CBClone) << " in clone "
3542                      << ore::NV("Caller", CBClone->getFunction())
3543                      << " marked with memprof allocation attribute "
3544                      << ore::NV("Attribute", AllocTypeString));
3545           }
3546         } else if (!CallsiteContext.empty()) {
3547           // Consult the next callsite node.
3548           assert(SI != FS->callsites().end());
3549           auto &StackNode = *(SI++);
3550 
3551 #ifndef NDEBUG
3552           // Sanity check that the stack ids match between the summary and
3553           // instruction metadata.
3554           auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3555           for (auto StackId : CallsiteContext) {
3556             assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3557             assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3558                    StackId);
3559             StackIdIndexIter++;
3560           }
3561 #endif
3562 
3563           CloneCallsite(StackNode, CB, CalledFunction);
3564         } else if (CB->isTailCall()) {
3565           // Locate the synthesized callsite info for the callee VI, if any was
3566           // created, and use that for cloning.
3567           ValueInfo CalleeVI =
3568               findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3569           if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3570             auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3571             assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3572             CloneCallsite(Callsite->second, CB, CalledFunction);
3573           }
3574         }
3575         // Memprof and callsite metadata on memory allocations no longer needed.
3576         I.setMetadata(LLVMContext::MD_memprof, nullptr);
3577         I.setMetadata(LLVMContext::MD_callsite, nullptr);
3578       }
3579     }
3580   }
3581 
3582   return Changed;
3583 }
3584 
3585 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3586 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3587   if (DumpCCG) {
3588     dbgs() << "CCG before cloning:\n";
3589     dbgs() << *this;
3590   }
3591   if (ExportToDot)
3592     exportToDot("postbuild");
3593 
3594   if (VerifyCCG)
3595     check();
3597 
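  // Identify nodes in the context graph that must be cloned to disambiguate
  // the allocation contexts (e.g. cold vs. not cold).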
3598   identifyClones();
3599 
3600   if (VerifyCCG)
3601     check();
3603 
3604   if (DumpCCG) {
3605     dbgs() << "CCG after cloning:\n";
3606     dbgs() << *this;
3607   }
3608   if (ExportToDot)
3609     exportToDot("cloned");
3610 
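  // Assign cloned nodes to function clones, creating the clones and updating
  // call targets as needed. Returns true if any changes were made.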
3611   bool Changed = assignFunctions();
3612 
3613   if (DumpCCG) {
3614     dbgs() << "CCG after assigning function clones:\n";
3615     dbgs() << *this;
3616   }
3617   if (ExportToDot)
3618     exportToDot("clonefuncassign");
3619 
3620   return Changed;
3621 }
3622 
3623 bool MemProfContextDisambiguation::processModule(
3624     Module &M,
3625     function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3626 
3627   // If we have an import summary, then the cloning decisions were made during
3628   // the thin link on the index. Apply them and return.
3629   if (ImportSummary)
3630     return applyImport(M);
3631 
3632   // TODO: If/when other types of memprof cloning are enabled beyond just for
3633   // hot and cold, we will need to change this to individually control the
3634   // AllocationType passed to addStackNodesForMIB during CCG construction.
3635   // Note that we specifically check this after applying imports above, so that
3636   // the option doesn't need to be passed to distributed ThinLTO backend
3637   // clang processes, which won't necessarily have visibility into the linker
3638   // dependences. Instead the information is communicated from the LTO link to
3639   // the backends via the combined summary index.
3640   if (!SupportsHotColdNew)
3641     return false;
3642 
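  // Regular LTO (or testing via opt) path: build the callsite context graph
  // directly over the module IR and perform the analysis and cloning here.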
3643   ModuleCallsiteContextGraph CCG(M, OREGetter);
3644   return CCG.process();
3645 }
3646 
3647 MemProfContextDisambiguation::MemProfContextDisambiguation(
3648     const ModuleSummaryIndex *Summary)
3649     : ImportSummary(Summary) {
3650   if (ImportSummary) {
3651     // The MemProfImportSummary should only be used for testing ThinLTO
3652     // distributed backend handling via opt, in which case we don't have a
3653     // summary from the pass pipeline.
3654     assert(MemProfImportSummary.empty());
3655     return;
3656   }
3657   if (MemProfImportSummary.empty())
3658     return;
3659 
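  // Testing-only path: load the summary index from the file named by the
  // MemProfImportSummary option and use it as this pass's import summary.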
3660   auto ReadSummaryFile =
3661       errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3662   if (!ReadSummaryFile) {
3663     logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3664                           "Error loading file '" + MemProfImportSummary +
3665                               "': ");
3666     return;
3667   }
3668   auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3669   if (!ImportSummaryForTestingOrErr) {
3670     logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3671                           "Error parsing file '" + MemProfImportSummary +
3672                               "': ");
3673     return;
3674   }
3675   ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3676   ImportSummary = ImportSummaryForTesting.get();
3677 }
3678 
3679 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3680                                                     ModuleAnalysisManager &AM) {
3681   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3682   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3683     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3684   };
3685   if (!processModule(M, OREGetter))
3686     return PreservedAnalyses::all();
3687   return PreservedAnalyses::none();
3688 }
3689 
3690 void MemProfContextDisambiguation::run(
3691     ModuleSummaryIndex &Index,
3692     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3693         isPrevailing) {
3694   // TODO: If/when other types of memprof cloning are enabled beyond just for
3695   // hot and cold, we will need to change this to individually control the
3696   // AllocationType passed to addStackNodesForMIB during CCG construction.
3697   // The index was set from the option, so these should be in sync.
3698   assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3699   if (!SupportsHotColdNew)
3700     return;
3701 
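  // Thin link path: build the callsite context graph over the combined summary
  // index and run the analysis. The resulting cloning decisions are recorded
  // in the index and applied later in the ThinLTO backends (see applyImport).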
3702   IndexCallsiteContextGraph CCG(Index, isPrevailing);
3703   CCG.process();
3704 }
3705