1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
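// For example (illustrative only; the clone name and attribute spelling below
// are hypothetical): if the allocation in leaf() is cold only when reached via
// B() -> leaf(), cloning exposes that cold context so the two copies can be
// annotated differently:
//
//   A() -> leaf()            allocation annotated not-cold
//   B() -> leaf.memprof.1()  allocation clone annotated cold
//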
21 //===----------------------------------------------------------------------===//
22 
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Instructions.h"
37 #include "llvm/IR/Module.h"
38 #include "llvm/IR/ModuleSummaryIndex.h"
39 #include "llvm/Pass.h"
40 #include "llvm/Support/CommandLine.h"
41 #include "llvm/Support/GraphWriter.h"
42 #include "llvm/Support/raw_ostream.h"
43 #include "llvm/Transforms/IPO.h"
44 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
45 #include "llvm/Transforms/Utils/Cloning.h"
46 #include "llvm/Transforms/Utils/Instrumentation.h"
47 #include <deque>
48 #include <sstream>
49 #include <unordered_map>
50 #include <vector>
51 using namespace llvm;
52 using namespace llvm::memprof;
53 
54 #define DEBUG_TYPE "memprof-context-disambiguation"
55 
56 STATISTIC(FunctionClonesAnalysis,
57           "Number of function clones created during whole program analysis");
58 STATISTIC(FunctionClonesThinBackend,
59           "Number of function clones created during ThinLTO backend");
60 STATISTIC(FunctionsClonedThinBackend,
61           "Number of functions that had clones created during ThinLTO backend");
62 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63                             "cloned) during whole program analysis");
64 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65                          "during whole program analysis");
66 STATISTIC(AllocTypeNotColdThinBackend,
67           "Number of not cold static allocations (possibly cloned) during "
68           "ThinLTO backend");
69 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70                                     "(possibly cloned) during ThinLTO backend");
71 STATISTIC(OrigAllocsThinBackend,
72           "Number of original (not cloned) allocations with memprof profiles "
73           "during ThinLTO backend");
74 STATISTIC(
75     AllocVersionsThinBackend,
76     "Number of allocation versions (including clones) during ThinLTO backend");
77 STATISTIC(MaxAllocVersionsThinBackend,
78           "Maximum number of allocation versions created for an original "
79           "allocation during ThinLTO backend");
80 STATISTIC(UnclonableAllocsThinBackend,
81           "Number of unclonable ambigous allocations during ThinLTO backend");
82 STATISTIC(RemovedEdgesWithMismatchedCallees,
83           "Number of edges removed due to mismatched callees (profiled vs IR)");
84 STATISTIC(FoundProfiledCalleeCount,
85           "Number of profiled callees found via tail calls");
86 STATISTIC(FoundProfiledCalleeDepth,
87           "Aggregate depth of profiled callees found via tail calls");
88 STATISTIC(FoundProfiledCalleeMaxDepth,
89           "Maximum depth of profiled callees found via tail calls");
90 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91           "Number of profiled callees found via multiple tail call chains");
92 
93 static cl::opt<std::string> DotFilePathPrefix(
94     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95     cl::value_desc("filename"),
96     cl::desc("Specify the path prefix of the MemProf dot files."));
97 
98 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99                                  cl::Hidden,
100                                  cl::desc("Export graph to dot files."));
101 
102 static cl::opt<bool>
103     DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104             cl::desc("Dump CallingContextGraph to stdout after each stage."));
105 
106 static cl::opt<bool>
107     VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108               cl::desc("Perform verification checks on CallingContextGraph."));
109 
110 static cl::opt<bool>
111     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112                 cl::desc("Perform frequent verification checks on nodes."));
113 
114 static cl::opt<std::string> MemProfImportSummary(
115     "memprof-import-summary",
116     cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117     cl::Hidden);
118 
119 static cl::opt<unsigned>
120     TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121                         cl::Hidden,
122                         cl::desc("Max depth to recursively search for missing "
123                                  "frames through tail calls."));
124 
125 // Optionally enable cloning of callsites involved in recursive cycles.
126 static cl::opt<bool> AllowRecursiveCallsites(
127     "memprof-allow-recursive-callsites", cl::init(false), cl::Hidden,
128     cl::desc("Allow cloning of callsites involved in recursive cycles"));
129 
130 // When disabled, try to detect and prevent cloning of recursive contexts.
131 // This is only necessary until we support cloning through recursive cycles.
132 // Leave it on by default for now, as disabling adds a small amount of compile
133 // time overhead; leaving it on doesn't affect correctness and just inflates the
134 // cold hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
135 static cl::opt<bool> AllowRecursiveContexts(
136     "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
137     cl::desc("Allow cloning of contexts through recursive cycles"));
138 
139 namespace llvm {
140 cl::opt<bool> EnableMemProfContextDisambiguation(
141     "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
142     cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
143 
144 // Indicate we are linking with an allocator that supports hot/cold operator
145 // new interfaces.
146 cl::opt<bool> SupportsHotColdNew(
147     "supports-hot-cold-new", cl::init(false), cl::Hidden,
148     cl::desc("Linking with hot/cold operator new interfaces"));
149 
150 cl::opt<bool> MemProfRequireDefinitionForPromotion(
151     "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
152     cl::desc(
153         "Require target function definition when promoting indirect calls"));
154 } // namespace llvm
155 
156 extern cl::opt<bool> MemProfReportHintedSizes;
157 extern cl::opt<unsigned> MinClonedColdBytePercent;
158 
159 namespace {
160 /// CRTP base for graphs built from either IR or ThinLTO summary index.
161 ///
162 /// The graph represents the call contexts in all memprof metadata on allocation
163 /// calls, with nodes for the allocations themselves, as well as for the calls
164 /// in each context. The graph is initially built from the allocation memprof
165 /// metadata (or summary) MIBs. It is then updated to match calls that have
166 /// callsite metadata onto the existing nodes, reflecting any inlining performed
167 /// on those calls.
168 ///
169 /// Each MIB (representing an allocation's call context with allocation
170 /// behavior) is assigned a unique context id during the graph build. The edges
171 /// and nodes in the graph are decorated with the context ids they carry. This
172 /// is used to correctly update the graph when cloning is performed so that we
173 /// can uniquify the context for a single (possibly cloned) allocation.
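///
/// Derived classes (ModuleCallsiteContextGraph and IndexCallsiteContextGraph
/// below) provide the representation-specific hooks (e.g. getStackId,
/// calleeMatchesFunc, updateAllocationCall) that the base class invokes via
/// static_cast<DerivedCCG *>(this).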
174 template <typename DerivedCCG, typename FuncTy, typename CallTy>
175 class CallsiteContextGraph {
176 public:
177   CallsiteContextGraph() = default;
178   CallsiteContextGraph(const CallsiteContextGraph &) = default;
179   CallsiteContextGraph(CallsiteContextGraph &&) = default;
180 
181   /// Main entry point to perform analysis and transformations on graph.
182   bool process();
183 
184   /// Perform cloning on the graph necessary to uniquely identify the allocation
185   /// behavior of an allocation based on its context.
186   void identifyClones();
187 
188   /// Assign callsite clones to functions, cloning functions as needed to
189   /// accommodate the combinations of their callsite clones reached by callers.
190   /// For regular LTO this clones functions and callsites in the IR, but for
191   /// ThinLTO the cloning decisions are noted in the summaries and later applied
192   /// in applyImport.
193   bool assignFunctions();
194 
195   void dump() const;
196   void print(raw_ostream &OS) const;
197   void printTotalSizes(raw_ostream &OS) const;
198 
199   friend raw_ostream &operator<<(raw_ostream &OS,
200                                  const CallsiteContextGraph &CCG) {
201     CCG.print(OS);
202     return OS;
203   }
204 
205   friend struct GraphTraits<
206       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
207   friend struct DOTGraphTraits<
208       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
209 
210   void exportToDot(std::string Label) const;
211 
212   /// Represents a function clone via FuncTy pointer and clone number pair.
213   struct FuncInfo final
214       : public std::pair<FuncTy *, unsigned /*Clone number*/> {
215     using Base = std::pair<FuncTy *, unsigned>;
216     FuncInfo(const Base &B) : Base(B) {}
217     FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
218     explicit operator bool() const { return this->first != nullptr; }
219     FuncTy *func() const { return this->first; }
220     unsigned cloneNo() const { return this->second; }
221   };
222 
223   /// Represents a callsite clone via CallTy and clone number pair.
224   struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
225     using Base = std::pair<CallTy, unsigned>;
226     CallInfo(const Base &B) : Base(B) {}
227     CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
228         : Base(Call, CloneNo) {}
229     explicit operator bool() const { return (bool)this->first; }
230     CallTy call() const { return this->first; }
231     unsigned cloneNo() const { return this->second; }
232     void setCloneNo(unsigned N) { this->second = N; }
233     void print(raw_ostream &OS) const {
234       if (!operator bool()) {
235         assert(!cloneNo());
236         OS << "null Call";
237         return;
238       }
239       call()->print(OS);
240       OS << "\t(clone " << cloneNo() << ")";
241     }
242     void dump() const {
243       print(dbgs());
244       dbgs() << "\n";
245     }
246     friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
247       Call.print(OS);
248       return OS;
249     }
250   };
251 
252   struct ContextEdge;
253 
254   /// Node in the Callsite Context Graph
255   struct ContextNode {
256     // Whether this node represents an allocation. Keep this for now since in
257     // the IR case, where we have an Instruction*, it is not as immediately
258     // discoverable. Used for printing richer information when dumping the graph.
259     bool IsAllocation;
260 
261     // Keeps track of when the Call was reset to null because there was
262     // recursion.
263     bool Recursive = false;
264 
265     // This will be formed by ORing together the AllocationType enum values
266     // for contexts including this node.
267     uint8_t AllocTypes = 0;
268 
269     // The corresponding allocation or interior call. This is the primary call
270     // for which we have created this node.
271     CallInfo Call;
272 
273     // List of other calls that can be treated the same as the primary call
274     // through cloning. I.e. they are located in the same function and have the
275     // same (possibly pruned) stack ids. They will be updated the same way as the
276     // primary call when assigning to function clones.
277     SmallVector<CallInfo, 0> MatchingCalls;
278 
279     // For alloc nodes this is a unique id assigned when constructed, and for
280     // callsite stack nodes it is the original stack id when the node is
281     // constructed from the memprof MIB metadata on the alloc nodes. Note that
282     // this is only used when matching callsite metadata onto the stack nodes
283     // created when processing the allocation memprof MIBs, and for labeling
284     // nodes in the dot graph. Therefore we don't bother to assign a value for
285     // clones.
286     uint64_t OrigStackOrAllocId = 0;
287 
288     // Edges to all callees in the profiled call stacks.
289     // TODO: Should this be a map (from Callee node) for more efficient lookup?
290     std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
291 
292     // Edges to all callers in the profiled call stacks.
293     // TODO: Should this be a map (from Caller node) for more efficient lookup?
294     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
295 
296     // Get the list of edges from which we can compute allocation information
297     // such as the context ids and allocation type of this node.
298     const std::vector<std::shared_ptr<ContextEdge>> *
299     getEdgesWithAllocInfo() const {
300       // If node has any callees, compute from those, otherwise compute from
301       // callers (i.e. if this is the leaf allocation node).
302       if (!CalleeEdges.empty())
303         return &CalleeEdges;
304       if (!CallerEdges.empty()) {
305         // A node with caller edges but no callee edges must be the allocation
306         // node.
307         assert(IsAllocation);
308         return &CallerEdges;
309       }
310       return nullptr;
311     }
312 
313     // Compute the context ids for this node from the union of its edge context
314     // ids.
315     DenseSet<uint32_t> getContextIds() const {
316       DenseSet<uint32_t> ContextIds;
317       auto *Edges = getEdgesWithAllocInfo();
318       if (!Edges)
319         return {};
320       unsigned Count = 0;
321       for (auto &Edge : *Edges)
322         Count += Edge->getContextIds().size();
323       ContextIds.reserve(Count);
324       for (auto &Edge : *Edges)
325         ContextIds.insert(Edge->getContextIds().begin(),
326                           Edge->getContextIds().end());
327       return ContextIds;
328     }
329 
330     // Compute the allocation type for this node from the OR of its edge
331     // allocation types.
332     uint8_t computeAllocType() const {
333       auto *Edges = getEdgesWithAllocInfo();
334       if (!Edges)
335         return (uint8_t)AllocationType::None;
336       uint8_t BothTypes =
337           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
338       uint8_t AllocType = (uint8_t)AllocationType::None;
339       for (auto &Edge : *Edges) {
340         AllocType |= Edge->AllocTypes;
341         // Bail early if alloc type reached both; no further refinement is possible.
342         if (AllocType == BothTypes)
343           return AllocType;
344       }
345       return AllocType;
346     }
347 
348     // The context ids set for this node is empty if its edge context ids are
349     // also all empty.
350     bool emptyContextIds() const {
351       auto *Edges = getEdgesWithAllocInfo();
352       if (!Edges)
353         return true;
354       for (auto &Edge : *Edges) {
355         if (!Edge->getContextIds().empty())
356           return false;
357       }
358       return true;
359     }
360 
361     // List of clones of this ContextNode, initially empty.
362     std::vector<ContextNode *> Clones;
363 
364     // If a clone, points to the original uncloned node.
365     ContextNode *CloneOf = nullptr;
366 
367     ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
368 
369     ContextNode(bool IsAllocation, CallInfo C)
370         : IsAllocation(IsAllocation), Call(C) {}
371 
372     void addClone(ContextNode *Clone) {
373       if (CloneOf) {
374         CloneOf->Clones.push_back(Clone);
375         Clone->CloneOf = CloneOf;
376       } else {
377         Clones.push_back(Clone);
378         assert(!Clone->CloneOf);
379         Clone->CloneOf = this;
380       }
381     }
382 
383     ContextNode *getOrigNode() {
384       if (!CloneOf)
385         return this;
386       return CloneOf;
387     }
388 
389     void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
390                                unsigned int ContextId);
391 
392     ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
393     ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
394     void eraseCalleeEdge(const ContextEdge *Edge);
395     void eraseCallerEdge(const ContextEdge *Edge);
396 
397     void setCall(CallInfo C) { Call = C; }
398 
399     bool hasCall() const { return (bool)Call.call(); }
400 
401     void printCall(raw_ostream &OS) const { Call.print(OS); }
402 
403     // True if this node was effectively removed from the graph, in which case
404     // it should have an allocation type of None and empty context ids.
405     bool isRemoved() const {
406       assert((AllocTypes == (uint8_t)AllocationType::None) ==
407              emptyContextIds());
408       return AllocTypes == (uint8_t)AllocationType::None;
409     }
410 
411     void dump() const;
412     void print(raw_ostream &OS) const;
413 
414     friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
415       Node.print(OS);
416       return OS;
417     }
418   };
419 
420   /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
421   /// callee.
422   struct ContextEdge {
423     ContextNode *Callee;
424     ContextNode *Caller;
425 
426     // This will be formed by ORing together the AllocationType enum values
427     // for contexts including this edge.
428     uint8_t AllocTypes = 0;
429 
430     // The set of IDs for contexts including this edge.
431     DenseSet<uint32_t> ContextIds;
432 
433     ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
434                 DenseSet<uint32_t> ContextIds)
435         : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
436           ContextIds(std::move(ContextIds)) {}
437 
438     DenseSet<uint32_t> &getContextIds() { return ContextIds; }
439 
440     // Helper to clear the fields of this edge when we are removing it from the
441     // graph.
442     inline void clear() {
443       ContextIds.clear();
444       AllocTypes = (uint8_t)AllocationType::None;
445       Caller = nullptr;
446       Callee = nullptr;
447     }
448 
449     // Check if edge was removed from the graph. This is useful while iterating
450     // over a copy of edge lists when performing operations that mutate the
451     // graph in ways that might remove one of the edges.
452     inline bool isRemoved() const {
453       if (Callee || Caller)
454         return false;
455       // Any edges that have been removed from the graph but are still in a
456       // shared_ptr somewhere should have all fields null'ed out by clear()
457       // above.
458       assert(AllocTypes == (uint8_t)AllocationType::None);
459       assert(ContextIds.empty());
460       return true;
461     }
462 
463     void dump() const;
464     void print(raw_ostream &OS) const;
465 
466     friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
467       Edge.print(OS);
468       return OS;
469     }
470   };
471 
472   /// Helpers to remove edges that have allocation type None (due to not
473   /// carrying any context ids) after transformations.
474   void removeNoneTypeCalleeEdges(ContextNode *Node);
475   void removeNoneTypeCallerEdges(ContextNode *Node);
476   void
477   recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
478                                        DenseSet<const ContextNode *> &Visited);
479 
480 protected:
481   /// Get a list of nodes corresponding to the stack ids in the given callsite
482   /// context.
483   template <class NodeT, class IteratorT>
484   std::vector<uint64_t>
485   getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
486 
487   /// Adds nodes for the given allocation and any stack ids on its memprof MIB
488   /// metadata (or summary).
489   ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
490 
491   /// Adds nodes for the given MIB stack ids.
492   template <class NodeT, class IteratorT>
493   void addStackNodesForMIB(ContextNode *AllocNode,
494                            CallStack<NodeT, IteratorT> &StackContext,
495                            CallStack<NodeT, IteratorT> &CallsiteContext,
496                            AllocationType AllocType,
497                            ArrayRef<ContextTotalSize> ContextSizeInfo);
498 
499   /// Matches all callsite metadata (or summary) to the nodes created for
500   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
501   /// inlining performed on those callsite instructions.
502   void updateStackNodes();
503 
504   /// Update graph to conservatively handle any callsite stack nodes that target
505   /// multiple different callee functions.
506   void handleCallsitesWithMultipleTargets();
507 
508   // Try to partition calls on the given node (already placed into the AllCalls
509   // array) by callee function, creating new copies of Node as needed to hold
510   // calls with different callees, and moving the callee edges appropriately.
511   // Returns true if partitioning was successful.
512   bool partitionCallsByCallee(
513       ContextNode *Node, ArrayRef<CallInfo> AllCalls,
514       std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);
515 
516   /// Save lists of calls with MemProf metadata in each function, for faster
517   /// iteration.
518   MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
519 
520   /// Map from callsite node to the enclosing caller function.
521   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
522 
523 private:
524   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
525 
526   // Structure to keep track of information for each call as we are matching
527   // non-allocation callsites onto context nodes created from the allocation
528   // call metadata / summary contexts.
529   struct CallContextInfo {
530     // The callsite we're trying to match.
531     CallTy Call;
532     // The callsite's stack ids that have a context node in the graph.
533     std::vector<uint64_t> StackIds;
534     // The function containing this callsite.
535     const FuncTy *Func;
536     // Initially empty; if needed, this will be updated to contain the context
537     // ids for use in a new context node created for this callsite.
538     DenseSet<uint32_t> ContextIds;
539   };
540 
541   /// Helper to remove edge from graph, updating edge iterator if it is provided
542   /// (in which case CalleeIter indicates which edge list is being iterated).
543   /// This will also perform the necessary clearing of the ContextEdge members
544   /// to enable later checking if the edge has been removed (since we may have
545   /// other copies of the shared_ptr in existence, and in fact rely on this to
546   /// enable removal while iterating over a copy of a node's edge list).
547   void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
548                            bool CalleeIter = true);
549 
550   /// Assigns the given Node to calls at or inlined into the location with
551   /// the Node's stack id, after post order traversing and processing its
552   /// caller nodes. Uses the call information recorded in the given
553   /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
554   /// as needed. Called by updateStackNodes which sets up the given
555   /// StackIdToMatchingCalls map.
556   void assignStackNodesPostOrder(
557       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
558       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
559       DenseMap<CallInfo, CallInfo> &CallToMatchingCall);
560 
561   /// Duplicates the given set of context ids, updating the provided
562   /// map from each original id with the newly generated context ids,
563   /// and returning the new duplicated id set.
564   DenseSet<uint32_t> duplicateContextIds(
565       const DenseSet<uint32_t> &StackSequenceContextIds,
566       DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
567 
568   /// Propagates all duplicated context ids across the graph.
569   void propagateDuplicateContextIds(
570       const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
571 
572   /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
573   /// else to its callers. Also updates OrigNode's edges to remove any context
574   /// ids moved to the newly created edge.
575   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
576                       bool TowardsCallee,
577                       DenseSet<uint32_t> RemainingContextIds);
578 
579   /// Get the stack id corresponding to the given Id or Index (for IR this will
580   /// return itself, for a summary index this will return the id recorded in the
581   /// index for that stack id index value).
582   uint64_t getStackId(uint64_t IdOrIndex) const {
583     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
584   }
585 
586   /// Returns true if the given call targets the callee of the given edge, or if
587   /// we were able to identify the call chain through intermediate tail calls.
588   /// In the latter case new context nodes are added to the graph for the
589   /// identified tail calls, and their synthesized nodes are added to
590   /// TailCallToContextNodeMap. In the latter case the EdgeIter is also updated
591   /// to reflect the changed edges and to prepare it for an increment in the caller.
592   bool
593   calleesMatch(CallTy Call, EdgeIter &EI,
594                MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
595 
596   // Return the callee function of the given call, or nullptr if it can't be
597   // determined.
598   const FuncTy *getCalleeFunc(CallTy Call) {
599     return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
600   }
601 
602   /// Returns true if the given call targets the given function, or if we were
603   /// able to identify the call chain through intermediate tail calls (in which
604   /// case FoundCalleeChain will be populated).
605   bool calleeMatchesFunc(
606       CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
607       std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
608     return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
609         Call, Func, CallerFunc, FoundCalleeChain);
610   }
611 
612   /// Returns true if both call instructions have the same callee.
613   bool sameCallee(CallTy Call1, CallTy Call2) {
614     return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
615   }
616 
617   /// Get a list of nodes corresponding to the stack ids in the given
618   /// callsite's context.
619   std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
620     return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
621         Call);
622   }
623 
624   /// Get the last stack id in the context for the given callsite.
625   uint64_t getLastStackId(CallTy Call) {
626     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
627   }
628 
629   /// Update the allocation call to record the type of allocated memory.
630   void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
631     AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
632     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
633   }
634 
635   /// Get the AllocationType assigned to the given allocation instruction clone.
636   AllocationType getAllocationCallType(const CallInfo &Call) const {
637     return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
638   }
639 
640   /// Update non-allocation call to invoke (possibly cloned) function
641   /// CalleeFunc.
642   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
643     static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
644   }
645 
646   /// Clone the given function for the given callsite, recording the mapping of
647   /// all of the function's tracked calls to their new versions in the CallMap.
648   /// Assigns new clones to clone number CloneNo.
649   FuncInfo cloneFunctionForCallsite(
650       FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
651       std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
652     return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
653         Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
654   }
655 
656   /// Gets a label to use in the dot graph for the given call clone in the given
657   /// function.
658   std::string getLabel(const FuncTy *Func, const CallTy Call,
659                        unsigned CloneNo) const {
660     return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
661   }
662 
663   // Create and return a new ContextNode.
664   ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
665                              CallInfo C = CallInfo()) {
666     NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
667     auto *NewNode = NodeOwner.back().get();
668     if (F)
669       NodeToCallingFunc[NewNode] = F;
670     return NewNode;
671   }
672 
673   /// Helpers to find the node corresponding to the given call or stackid.
674   ContextNode *getNodeForInst(const CallInfo &C);
675   ContextNode *getNodeForAlloc(const CallInfo &C);
676   ContextNode *getNodeForStackId(uint64_t StackId);
677 
678   /// Computes the alloc type corresponding to the given context ids, by
679   /// unioning their recorded alloc types.
680   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;
681 
682   /// Returns the allocation type of the intersection of the contexts of two
683   /// nodes (based on their provided context id sets), optimized for the case
684   /// when Node1Ids is smaller than Node2Ids.
685   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
686                                   const DenseSet<uint32_t> &Node2Ids) const;
687 
688   /// Returns the allocation type of the intersection of the contexts of two
689   /// nodes (based on their provided context id sets).
690   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
691                               const DenseSet<uint32_t> &Node2Ids) const;
692 
693   /// Create a clone of Edge's callee and move Edge to that new callee node,
694   /// performing the necessary context id and allocation type updates.
695   /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
696   /// moved to an edge to the new callee.
697   ContextNode *
698   moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
699                            DenseSet<uint32_t> ContextIdsToMove = {});
700 
701   /// Change the callee of Edge to existing callee clone NewCallee, performing
702   /// the necessary context id and allocation type updates.
703   /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
704   /// moved to an edge to the new callee.
705   void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
706                                      ContextNode *NewCallee,
707                                      bool NewClone = false,
708                                      DenseSet<uint32_t> ContextIdsToMove = {});
709 
710   /// Change the caller of the given callee edge to be NewCaller, performing
711   /// the necessary context id and allocation type
712   /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
713   /// a simplified version of it as we always move the given edge and all of its
714   /// context ids.
715   void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
716                                  ContextNode *NewCaller);
717 
718   /// Recursively perform cloning on the graph for the given Node and its
719   /// callers, in order to uniquely identify the allocation behavior of an
720   /// allocation given its context. The context ids of the allocation being
721   /// processed are given in AllocContextIds.
722   void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
723                       const DenseSet<uint32_t> &AllocContextIds);
724 
725   /// Map from each context ID to the AllocationType assigned to that context.
726   DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
727 
728   /// Map from each contextID to the profiled full contexts and their total
729   /// sizes (there may be more than one due to context trimming),
730   /// optionally populated when requested (via MemProfReportHintedSizes or
731   /// MinClonedColdBytePercent).
732   DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;
733 
734   /// Identifies the context node created for a stack id when adding the MIB
735   /// contexts to the graph. This is used to locate the context nodes when
736   /// trying to assign the corresponding callsites with those stack ids to these
737   /// nodes.
738   DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
739 
740   /// Maps to track the calls to their corresponding nodes in the graph.
741   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
742   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
743 
744   /// Owner of all ContextNode unique_ptrs.
745   std::vector<std::unique_ptr<ContextNode>> NodeOwner;
746 
747   /// Perform sanity checks on graph when requested.
748   void check() const;
749 
750   /// Keeps track of the last unique context id assigned.
751   unsigned int LastContextId = 0;
752 };
753 
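// Convenience alias templates so the free helper functions below can refer to
// the nested types of a CallsiteContextGraph instantiation without spelling
// out the full qualified type.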
754 template <typename DerivedCCG, typename FuncTy, typename CallTy>
755 using ContextNode =
756     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
757 template <typename DerivedCCG, typename FuncTy, typename CallTy>
758 using ContextEdge =
759     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
760 template <typename DerivedCCG, typename FuncTy, typename CallTy>
761 using FuncInfo =
762     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
763 template <typename DerivedCCG, typename FuncTy, typename CallTy>
764 using CallInfo =
765     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
766 
767 /// CRTP derived class for graphs built from IR (regular LTO).
768 class ModuleCallsiteContextGraph
769     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
770                                   Instruction *> {
771 public:
772   ModuleCallsiteContextGraph(
773       Module &M,
774       llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
775 
776 private:
777   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
778                               Instruction *>;
779 
780   uint64_t getStackId(uint64_t IdOrIndex) const;
781   const Function *getCalleeFunc(Instruction *Call);
782   bool calleeMatchesFunc(
783       Instruction *Call, const Function *Func, const Function *CallerFunc,
784       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
785   bool sameCallee(Instruction *Call1, Instruction *Call2);
786   bool findProfiledCalleeThroughTailCalls(
787       const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
788       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
789       bool &FoundMultipleCalleeChains);
790   uint64_t getLastStackId(Instruction *Call);
791   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
792   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
793   AllocationType getAllocationCallType(const CallInfo &Call) const;
794   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
795   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
796                        Instruction *>::FuncInfo
797   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
798                            std::map<CallInfo, CallInfo> &CallMap,
799                            std::vector<CallInfo> &CallsWithMetadataInFunc,
800                            unsigned CloneNo);
801   std::string getLabel(const Function *Func, const Instruction *Call,
802                        unsigned CloneNo) const;
803 
804   const Module &Mod;
805   llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
806 };
807 
808 /// Represents a call in the summary index graph, which can either be an
809 /// allocation or an interior callsite node in an allocation's context.
810 /// Holds a pointer to the corresponding data structure in the index.
811 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
812   IndexCall() : PointerUnion() {}
813   IndexCall(std::nullptr_t) : IndexCall() {}
814   IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
815   IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
816   IndexCall(PointerUnion PT) : PointerUnion(PT) {}
817 
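  // Allow IndexCall to be dereferenced with the arrow operator like a pointer,
  // so templated code written against a pointer-like CallTy (e.g.
  // Call->print(OS)) also works for the index representation.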
818   IndexCall *operator->() { return this; }
819 
820   void print(raw_ostream &OS) const {
821     PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
822     if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
823       OS << *AI;
824     } else {
825       auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
826       assert(CI);
827       OS << *CI;
828     }
829   }
830 };
831 } // namespace
832 
833 namespace llvm {
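// simplify_type specializations allow IndexCall to participate in LLVM's
// isa/dyn_cast casting machinery by simplifying it to the underlying
// PointerUnion.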
834 template <> struct simplify_type<IndexCall> {
835   using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
836   static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
837 };
838 template <> struct simplify_type<const IndexCall> {
839   using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
840   static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
841 };
842 } // namespace llvm
843 
844 namespace {
845 /// CRTP derived class for graphs built from summary index (ThinLTO).
846 class IndexCallsiteContextGraph
847     : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
848                                   IndexCall> {
849 public:
850   IndexCallsiteContextGraph(
851       ModuleSummaryIndex &Index,
852       llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
853           isPrevailing);
854 
855   ~IndexCallsiteContextGraph() {
856     // Now that we are done with the graph it is safe to add the new
857     // CallsiteInfo structs to the function summary vectors. The graph nodes
858     // point into locations within these vectors, so we don't want to add them
859     // any earlier.
860     for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
861       auto *FS = I.first;
862       for (auto &Callsite : I.second)
863         FS->addCallsite(*Callsite.second);
864     }
865   }
866 
867 private:
868   friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
869                               IndexCall>;
870 
871   uint64_t getStackId(uint64_t IdOrIndex) const;
872   const FunctionSummary *getCalleeFunc(IndexCall &Call);
873   bool calleeMatchesFunc(
874       IndexCall &Call, const FunctionSummary *Func,
875       const FunctionSummary *CallerFunc,
876       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
877   bool sameCallee(IndexCall &Call1, IndexCall &Call2);
878   bool findProfiledCalleeThroughTailCalls(
879       ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
880       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
881       bool &FoundMultipleCalleeChains);
882   uint64_t getLastStackId(IndexCall &Call);
883   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
884   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
885   AllocationType getAllocationCallType(const CallInfo &Call) const;
886   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
887   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
888                        IndexCall>::FuncInfo
889   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
890                            std::map<CallInfo, CallInfo> &CallMap,
891                            std::vector<CallInfo> &CallsWithMetadataInFunc,
892                            unsigned CloneNo);
893   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
894                        unsigned CloneNo) const;
895 
896   // Saves the mapping from function summaries containing memprof records back
897   // to their VI, for use in checking and debugging.
898   std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
899 
900   const ModuleSummaryIndex &Index;
901   llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
902       isPrevailing;
903 
904   // Saves/owns the callsite info structures synthesized for missing tail call
905   // frames that we discover while building the graph.
906   // It maps from the summary of the function making the tail call, to a map
907   // of callee ValueInfo to corresponding synthesized callsite info.
908   std::unordered_map<FunctionSummary *,
909                      std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
910       FunctionCalleesToSynthesizedCallsiteInfos;
911 };
912 } // namespace
913 
914 namespace llvm {
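// DenseMapInfo specializations allowing CallInfo (for both graph flavors) and
// IndexCall to be used as DenseMap/MapVector keys, delegating to the traits of
// the underlying std::pair / PointerUnion types.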
915 template <>
916 struct DenseMapInfo<typename CallsiteContextGraph<
917     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
918     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
919 template <>
920 struct DenseMapInfo<typename CallsiteContextGraph<
921     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
922     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
923 template <>
924 struct DenseMapInfo<IndexCall>
925     : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
926 } // end namespace llvm
927 
928 namespace {
929 
930 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
931 // type we should actually use on the corresponding allocation.
932 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
933 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
934 // from NotCold.
935 AllocationType allocTypeToUse(uint8_t AllocTypes) {
936   assert(AllocTypes != (uint8_t)AllocationType::None);
937   if (AllocTypes ==
938       ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
939     return AllocationType::NotCold;
940   else
941     return (AllocationType)AllocTypes;
942 }
943 
944 // Helper to check if the alloc types for all edges recorded in the
945 // InAllocTypes vector match the alloc types for all edges in the Edges
946 // vector.
947 template <typename DerivedCCG, typename FuncTy, typename CallTy>
948 bool allocTypesMatch(
949     const std::vector<uint8_t> &InAllocTypes,
950     const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
951         &Edges) {
952   // This should be called only when the InAllocTypes vector was computed for
953   // this set of Edges. Make sure the sizes are the same.
954   assert(InAllocTypes.size() == Edges.size());
955   return std::equal(
956       InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
957       [](const uint8_t &l,
958          const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
959         // Can share if one of the edges is None type - don't
960         // care about the type along that edge as it doesn't
961         // exist for those context ids.
962         if (l == (uint8_t)AllocationType::None ||
963             r->AllocTypes == (uint8_t)AllocationType::None)
964           return true;
965         return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
966       });
967 }
968 
969 // Helper to check if the alloc types for all edges recorded in the
970 // InAllocTypes vector match the alloc types for callee edges in the given
971 // clone. Because the InAllocTypes were computed from the original node's callee
972 // edges, and other cloning could have happened after this clone was created, we
973 // need to find the matching clone callee edge, which may or may not exist.
974 template <typename DerivedCCG, typename FuncTy, typename CallTy>
975 bool allocTypesMatchClone(
976     const std::vector<uint8_t> &InAllocTypes,
977     const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
978   const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
979   assert(Node);
980   // InAllocTypes should have been computed for the original node's callee
981   // edges.
982   assert(InAllocTypes.size() == Node->CalleeEdges.size());
983   // First create a map of the clone callee edge callees to the edge alloc type.
984   DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
985       EdgeCalleeMap;
986   for (const auto &E : Clone->CalleeEdges) {
987     assert(!EdgeCalleeMap.contains(E->Callee));
988     EdgeCalleeMap[E->Callee] = E->AllocTypes;
989   }
990   // Next, walk the original node's callees, and look for the corresponding
991   // clone edge to that callee.
992   for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
993     auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
994     // Not found is ok, we will simply add an edge if we use this clone.
995     if (Iter == EdgeCalleeMap.end())
996       continue;
997     // Can share if one of the edges is None type - don't
998     // care about the type along that edge as it doesn't
999     // exist for those context ids.
1000     if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1001         Iter->second == (uint8_t)AllocationType::None)
1002       continue;
1003     if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1004       return false;
1005   }
1006   return true;
1007 }
1008 
1009 } // end anonymous namespace
1010 
1011 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1012 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1013 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1014     const CallInfo &C) {
1015   ContextNode *Node = getNodeForAlloc(C);
1016   if (Node)
1017     return Node;
1018 
1019   return NonAllocationCallToContextNodeMap.lookup(C);
1020 }
1021 
1022 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1023 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1024 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1025     const CallInfo &C) {
1026   return AllocationCallToContextNodeMap.lookup(C);
1027 }
1028 
1029 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1030 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1031 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1032     uint64_t StackId) {
1033   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1034   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1035     return StackEntryNode->second;
1036   return nullptr;
1037 }
1038 
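// Add an edge from this (callee) node to Caller for the given context, or if
// an edge to Caller already exists, update its allocation type and context ids
// with the new context.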
1039 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1040 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1041     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1042                           unsigned int ContextId) {
1043   for (auto &Edge : CallerEdges) {
1044     if (Edge->Caller == Caller) {
1045       Edge->AllocTypes |= (uint8_t)AllocType;
1046       Edge->getContextIds().insert(ContextId);
1047       return;
1048     }
1049   }
1050   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1051       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1052   CallerEdges.push_back(Edge);
1053   Caller->CalleeEdges.push_back(Edge);
1054 }
1055 
1056 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1057 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1058     ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1059   assert(!EI || (*EI)->get() == Edge);
1060   // Save the Caller and Callee pointers so we can erase Edge from their edge
1061   // lists after clearing Edge below. We do the clearing first in case it is
1062   // destructed after removing from the edge lists (if those were the last
1063   // shared_ptr references to Edge).
1064   auto *Callee = Edge->Callee;
1065   auto *Caller = Edge->Caller;
1066 
1067   // Make sure the edge fields are cleared out so we can properly detect
1068   // removed edges if Edge is not destructed because there is still a shared_ptr
1069   // reference.
1070   Edge->clear();
1071 
1072   if (!EI) {
1073     Callee->eraseCallerEdge(Edge);
1074     Caller->eraseCalleeEdge(Edge);
1075   } else if (CalleeIter) {
1076     Callee->eraseCallerEdge(Edge);
1077     *EI = Caller->CalleeEdges.erase(*EI);
1078   } else {
1079     Caller->eraseCalleeEdge(Edge);
1080     *EI = Callee->CallerEdges.erase(*EI);
1081   }
1082 }
1083 
1084 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1085 void CallsiteContextGraph<
1086     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1087   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1088     auto Edge = *EI;
1089     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1090       assert(Edge->ContextIds.empty());
1091       removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1092     } else
1093       ++EI;
1094   }
1095 }
1096 
1097 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1098 void CallsiteContextGraph<
1099     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1100   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1101     auto Edge = *EI;
1102     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1103       assert(Edge->ContextIds.empty());
1104       Edge->Caller->eraseCalleeEdge(Edge.get());
1105       EI = Node->CallerEdges.erase(EI);
1106     } else
1107       ++EI;
1108   }
1109 }
1110 
1111 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1112 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1113 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1114     findEdgeFromCallee(const ContextNode *Callee) {
1115   for (const auto &Edge : CalleeEdges)
1116     if (Edge->Callee == Callee)
1117       return Edge.get();
1118   return nullptr;
1119 }
1120 
1121 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1122 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1123 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1124     findEdgeFromCaller(const ContextNode *Caller) {
1125   for (const auto &Edge : CallerEdges)
1126     if (Edge->Caller == Caller)
1127       return Edge.get();
1128   return nullptr;
1129 }
1130 
1131 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1133     eraseCalleeEdge(const ContextEdge *Edge) {
1134   auto EI = llvm::find_if(
1135       CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1136         return CalleeEdge.get() == Edge;
1137       });
1138   assert(EI != CalleeEdges.end());
1139   CalleeEdges.erase(EI);
1140 }
1141 
1142 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1143 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1144     eraseCallerEdge(const ContextEdge *Edge) {
1145   auto EI = llvm::find_if(
1146       CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1147         return CallerEdge.get() == Edge;
1148       });
1149   assert(EI != CallerEdges.end());
1150   CallerEdges.erase(EI);
1151 }
1152 
1153 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1154 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1155     DenseSet<uint32_t> &ContextIds) const {
1156   uint8_t BothTypes =
1157       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1158   uint8_t AllocType = (uint8_t)AllocationType::None;
1159   for (auto Id : ContextIds) {
1160     AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1161     // Bail early if alloc type reached both; no further refinement is possible.
1162     if (AllocType == BothTypes)
1163       return AllocType;
1164   }
1165   return AllocType;
1166 }
1167 
1168 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1169 uint8_t
1170 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1171     const DenseSet<uint32_t> &Node1Ids,
1172     const DenseSet<uint32_t> &Node2Ids) const {
1173   uint8_t BothTypes =
1174       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1175   uint8_t AllocType = (uint8_t)AllocationType::None;
1176   for (auto Id : Node1Ids) {
1177     if (!Node2Ids.count(Id))
1178       continue;
1179     AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1180     // Bail early if alloc type reached both; no further refinement is possible.
1181     if (AllocType == BothTypes)
1182       return AllocType;
1183   }
1184   return AllocType;
1185 }
1186 
1187 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1188 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1189     const DenseSet<uint32_t> &Node1Ids,
1190     const DenseSet<uint32_t> &Node2Ids) const {
1191   if (Node1Ids.size() < Node2Ids.size())
1192     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1193   else
1194     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1195 }
1196 
1197 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1198 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1199 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1200     CallInfo Call, const FuncTy *F) {
1201   assert(!getNodeForAlloc(Call));
1202   ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1203   AllocationCallToContextNodeMap[Call] = AllocNode;
1204   // Use LastContextId as a unique id for MIB allocation nodes.
1205   AllocNode->OrigStackOrAllocId = LastContextId;
1206   // Alloc type should be updated as we add in the MIBs. We should assert
1207   // afterwards that it is not still None.
1208   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1209 
1210   return AllocNode;
1211 }
1212 
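// Helper to produce a printable string for a (possibly ORed together)
// AllocationType bitmask, used when dumping nodes and edges.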
1213 static std::string getAllocTypeString(uint8_t AllocTypes) {
1214   if (!AllocTypes)
1215     return "None";
1216   std::string Str;
1217   if (AllocTypes & (uint8_t)AllocationType::NotCold)
1218     Str += "NotCold";
1219   if (AllocTypes & (uint8_t)AllocationType::Cold)
1220     Str += "Cold";
1221   return Str;
1222 }
1223 
1224 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1225 template <class NodeT, class IteratorT>
1226 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1227     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1228     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1229     ArrayRef<ContextTotalSize> ContextSizeInfo) {
1230   // Treat the hot alloc type as NotCold until disambiguation for "hot" is
1231   // done.
1232   if (AllocType == AllocationType::Hot)
1233     AllocType = AllocationType::NotCold;
1234 
1235   ContextIdToAllocationType[++LastContextId] = AllocType;
1236 
1237   if (!ContextSizeInfo.empty()) {
1238     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1239     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1240   }
1241 
1242   // Update alloc type and context ids for this MIB.
1243   AllocNode->AllocTypes |= (uint8_t)AllocType;
1244 
1245   // Now add or update nodes for each stack id in alloc's context.
1246   // Later when processing the stack ids on non-alloc callsites we will adjust
1247   // for any inlining in the context.
1248   ContextNode *PrevNode = AllocNode;
1249   // Look for recursion (direct recursion should have been collapsed by
1250   // module summary analysis; here we should just be detecting mutual
1251   // recursion). Mark such nodes so we don't try to clone them.
1252   SmallSet<uint64_t, 8> StackIdSet;
1253   // Skip any stack ids already on the allocation call itself (due to inlining).
1254   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1255        ContextIter != StackContext.end(); ++ContextIter) {
1256     auto StackId = getStackId(*ContextIter);
1257     ContextNode *StackNode = getNodeForStackId(StackId);
1258     if (!StackNode) {
1259       StackNode = createNewNode(/*IsAllocation=*/false);
1260       StackEntryIdToContextNodeMap[StackId] = StackNode;
1261       StackNode->OrigStackOrAllocId = StackId;
1262     }
1263     // Marking a node recursive will prevent its cloning completely, even for
1264     // non-recursive contexts flowing through it.
1265     if (!AllowRecursiveCallsites) {
1266       auto Ins = StackIdSet.insert(StackId);
1267       if (!Ins.second)
1268         StackNode->Recursive = true;
1269     }
1270     StackNode->AllocTypes |= (uint8_t)AllocType;
1271     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1272     PrevNode = StackNode;
1273   }
1274 }
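// Illustrative sketch: for a MIB whose remaining stack ids (after the shared
// prefix skipped above) are [A, B, C], ordered from the frame nearest the
// allocation outward, the loop links AllocNode <- A <- B <- C via caller
// edges, each edge carrying the single context id assigned to this MIB.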
1275 
1276 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1277 DenseSet<uint32_t>
1278 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1279     const DenseSet<uint32_t> &StackSequenceContextIds,
1280     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1281   DenseSet<uint32_t> NewContextIds;
1282   for (auto OldId : StackSequenceContextIds) {
1283     NewContextIds.insert(++LastContextId);
1284     OldToNewContextIds[OldId].insert(LastContextId);
1285     assert(ContextIdToAllocationType.count(OldId));
1286     // The new context has the same allocation type as original.
1287     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1288   }
1289   return NewContextIds;
1290 }
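// For example, duplicating {5, 9} when LastContextId is 20 produces {21, 22}
// and records 5 -> {21} and 9 -> {22} in OldToNewContextIds (the exact pairing
// depends on set iteration order), with each new id inheriting the allocation
// type of the id it was copied from.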
1291 
1292 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1293 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1294     propagateDuplicateContextIds(
1295         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1296   // Build a set of duplicated context ids corresponding to the input id set.
1297   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1298     DenseSet<uint32_t> NewIds;
1299     for (auto Id : ContextIds)
1300       if (auto NewId = OldToNewContextIds.find(Id);
1301           NewId != OldToNewContextIds.end())
1302         NewIds.insert(NewId->second.begin(), NewId->second.end());
1303     return NewIds;
1304   };
1305 
1306   // Recursively update context ids sets along caller edges.
1307   auto UpdateCallers = [&](ContextNode *Node,
1308                            DenseSet<const ContextEdge *> &Visited,
1309                            auto &&UpdateCallers) -> void {
1310     for (const auto &Edge : Node->CallerEdges) {
1311       auto Inserted = Visited.insert(Edge.get());
1312       if (!Inserted.second)
1313         continue;
1314       ContextNode *NextNode = Edge->Caller;
1315       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1316       // Only need to recursively iterate to NextNode via this caller edge if
1317       // it resulted in any added ids to NextNode.
1318       if (!NewIdsToAdd.empty()) {
1319         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1320         UpdateCallers(NextNode, Visited, UpdateCallers);
1321       }
1322     }
1323   };
1324 
1325   DenseSet<const ContextEdge *> Visited;
1326   for (auto &Entry : AllocationCallToContextNodeMap) {
1327     auto *Node = Entry.second;
1328     UpdateCallers(Node, Visited, UpdateCallers);
1329   }
1330 }
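// Note: edges (rather than nodes) are used as the visited keys, and the single
// Visited set is shared across all allocation roots, so each caller edge's
// context id set is updated at most once during the propagation.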
1331 
1332 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1333 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1334     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1335     // This must be passed by value to make a copy since it will be adjusted
1336     // as ids are moved.
1337     DenseSet<uint32_t> RemainingContextIds) {
1338   auto &OrigEdges =
1339       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1340   // Increment iterator in loop so that we can remove edges as needed.
1341   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1342     auto Edge = *EI;
1343     // Remove any matching context ids from Edge, returning the set that were
1344     // found and removed; these become the new edge's context ids. Also update
1345     // the remaining (not found) ids.
1346     DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1347     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1348                  NotFoundContextIds);
1349     RemainingContextIds.swap(NotFoundContextIds);
1350     // If no matching context ids for this edge, skip it.
1351     if (NewEdgeContextIds.empty()) {
1352       ++EI;
1353       continue;
1354     }
1355     if (TowardsCallee) {
1356       uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1357       auto NewEdge = std::make_shared<ContextEdge>(
1358           Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1359       NewNode->CalleeEdges.push_back(NewEdge);
1360       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1361     } else {
1362       uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1363       auto NewEdge = std::make_shared<ContextEdge>(
1364           NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1365       NewNode->CallerEdges.push_back(NewEdge);
1366       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1367     }
1368     // Remove old edge if context ids empty.
1369     if (Edge->getContextIds().empty()) {
1370       removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1371       continue;
1372     }
1373     ++EI;
1374   }
1375 }
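// Note: RemainingContextIds shrinks as ids are claimed by earlier edges in the
// loop, so each context id is moved onto at most one new edge of NewNode.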
1376 
1377 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1378 static void checkEdge(
1379     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1380   // Confirm that alloc type is not None and that we have at least one context
1381   // id.
1382   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1383   assert(!Edge->ContextIds.empty());
1384 }
1385 
1386 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1387 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1388                       bool CheckEdges = true) {
1389   if (Node->isRemoved())
1390     return;
1391 #ifndef NDEBUG
1392   // Compute node's context ids once for use in asserts.
1393   auto NodeContextIds = Node->getContextIds();
1394 #endif
1395   // Node's context ids should be the union of both its callee and caller edge
1396   // context ids.
1397   if (Node->CallerEdges.size()) {
1398     DenseSet<uint32_t> CallerEdgeContextIds(
1399         Node->CallerEdges.front()->ContextIds);
1400     for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1401       if (CheckEdges)
1402         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1403       set_union(CallerEdgeContextIds, Edge->ContextIds);
1404     }
1405     // Node can have more context ids than callers if some contexts terminate at
1406     // node and some are longer. If we are allowing recursive callsites but
1407     // haven't disabled recursive contexts, this will be violated for
1408     // incompletely cloned recursive cycles, so skip the checking in that case.
1409     assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1410            NodeContextIds == CallerEdgeContextIds ||
1411            set_is_subset(CallerEdgeContextIds, NodeContextIds));
1412   }
1413   if (Node->CalleeEdges.size()) {
1414     DenseSet<uint32_t> CalleeEdgeContextIds(
1415         Node->CalleeEdges.front()->ContextIds);
1416     for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1417       if (CheckEdges)
1418         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1419       set_union(CalleeEdgeContextIds, Edge->getContextIds());
1420     }
1421     assert(NodeContextIds == CalleeEdgeContextIds);
1422   }
1423   // FIXME: Since this checking is only invoked under an option, we should
1424   // change the error checking from using assert to something that will trigger
1425   // an error on a release build.
1426 #ifndef NDEBUG
1427   // Make sure we don't end up with duplicate edges between the same caller and
1428   // callee.
1429   DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1430   for (const auto &E : Node->CalleeEdges)
1431     NodeSet.insert(E->Callee);
1432   assert(NodeSet.size() == Node->CalleeEdges.size());
1433 #endif
1434 }
1435 
1436 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1437 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1438     assignStackNodesPostOrder(
1439         ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1440         DenseMap<uint64_t, std::vector<CallContextInfo>>
1441             &StackIdToMatchingCalls,
1442         DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1443   auto Inserted = Visited.insert(Node);
1444   if (!Inserted.second)
1445     return;
1446   // Post order traversal. Iterate over a copy since we may add nodes and
1447   // therefore new callers during the recursive call, invalidating any
1448   // iterator over the original edge vector. We don't need to process these
1449   // new nodes as they were already processed on creation.
1450   auto CallerEdges = Node->CallerEdges;
1451   for (auto &Edge : CallerEdges) {
1452     // Skip any that have been removed during the recursion.
1453     if (Edge->isRemoved()) {
1454       assert(!is_contained(Node->CallerEdges, Edge));
1455       continue;
1456     }
1457     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1458                               CallToMatchingCall);
1459   }
1460 
1461   // If this node's stack id is in the map, update the graph to contain new
1462   // nodes representing any inlining at interior callsites. Note we move the
1463   // associated context ids over to the new nodes.
1464 
1465   // Ignore this node if it is for an allocation or we didn't record any
1466   // stack id lists ending at it.
1467   if (Node->IsAllocation ||
1468       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1469     return;
1470 
1471   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1472   // Handle the simple case first. A single call with a single stack id.
1473   // In this case there is no need to create any new context nodes, simply
1474   // assign the context node for stack id to this Call.
1475   if (Calls.size() == 1) {
1476     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1477     if (Ids.size() == 1) {
1478       assert(SavedContextIds.empty());
1479       // It should be this Node
1480       assert(Node == getNodeForStackId(Ids[0]));
1481       if (Node->Recursive)
1482         return;
1483       Node->setCall(Call);
1484       NonAllocationCallToContextNodeMap[Call] = Node;
1485       NodeToCallingFunc[Node] = Func;
1486       return;
1487     }
1488   }
1489 
1490 #ifndef NDEBUG
1491   // Find the node for the last stack id, which should be the same
1492   // across all calls recorded for this id, and is this node's id.
1493   uint64_t LastId = Node->OrigStackOrAllocId;
1494   ContextNode *LastNode = getNodeForStackId(LastId);
1495   // We should only have kept stack ids that had nodes.
1496   assert(LastNode);
1497   assert(LastNode == Node);
1498 #else
1499   ContextNode *LastNode = Node;
1500 #endif
1501 
1502   // Compute the last node's context ids once, as they are shared by all calls
1503   // in this entry.
1504   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1505 
1506   bool PrevIterCreatedNode = false;
1507   bool CreatedNode = false;
1508   for (unsigned I = 0; I < Calls.size();
1509        I++, PrevIterCreatedNode = CreatedNode) {
1510     CreatedNode = false;
1511     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1512     // Skip any for which we didn't assign any ids, these don't get a node in
1513     // the graph.
1514     if (SavedContextIds.empty()) {
1515       // If this call has a matching call (located in the same function and
1516       // having the same stack ids), simply add it to the context node created
1517       // for its matching call earlier. These can be treated the same through
1518       // cloning and get updated at the same time.
1519       if (!CallToMatchingCall.contains(Call))
1520         continue;
1521       auto MatchingCall = CallToMatchingCall[Call];
1522       if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1523         // This should only happen if we had a prior iteration, and it didn't
1524         // create a node because of the below recomputation of context ids
1525         // finding none remaining and continuing early.
1526         assert(I > 0 && !PrevIterCreatedNode);
1527         continue;
1528       }
1529       NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1530           Call);
1531       continue;
1532     }
1533 
1534     assert(LastId == Ids.back());
1535 
1536     // Recompute the context ids for this stack id sequence (the
1537     // intersection of the context ids of the corresponding nodes).
1538     // Start with the ids we saved in the map for this call, which could be
1539     // duplicated context ids. We have to recompute as we might have overlap
1540     // between the saved context ids for different last nodes, and may have
1541     // removed them already during the post order traversal.
1542     set_intersect(SavedContextIds, LastNodeContextIds);
1543     ContextNode *PrevNode = LastNode;
1544     bool Skip = false;
1545     // Iterate backwards through the stack Ids, starting after the last Id
1546     // in the list, which was handled once outside for all Calls.
1547     for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1548       auto Id = *IdIter;
1549       ContextNode *CurNode = getNodeForStackId(Id);
1550       // We should only have kept stack ids that had nodes and weren't
1551       // recursive.
1552       assert(CurNode);
1553       assert(!CurNode->Recursive);
1554 
1555       auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1556       if (!Edge) {
1557         Skip = true;
1558         break;
1559       }
1560       PrevNode = CurNode;
1561 
1562       // Update the context ids, which is the intersection of the ids along
1563       // all edges in the sequence.
1564       set_intersect(SavedContextIds, Edge->getContextIds());
1565 
1566       // If we now have no context ids for clone, skip this call.
1567       if (SavedContextIds.empty()) {
1568         Skip = true;
1569         break;
1570       }
1571     }
1572     if (Skip)
1573       continue;
1574 
1575     // Create new context node.
1576     ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1577     NonAllocationCallToContextNodeMap[Call] = NewNode;
1578     CreatedNode = true;
1579     NewNode->AllocTypes = computeAllocType(SavedContextIds);
1580 
1581     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1582     assert(FirstNode);
1583 
1584     // Connect to callees of innermost stack frame in inlined call chain.
1585     // This updates context ids for FirstNode's callees to reflect those
1586     // moved to NewNode.
1587     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1588 
1589     // Connect to callers of outermost stack frame in inlined call chain.
1590     // This updates context ids for LastNode's callers to reflect those
1591     // moved to NewNode.
1592     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1593 
1594     // Now we need to remove context ids from edges/nodes between First and
1595     // Last Node.
1596     PrevNode = nullptr;
1597     for (auto Id : Ids) {
1598       ContextNode *CurNode = getNodeForStackId(Id);
1599       // We should only have kept stack ids that had nodes.
1600       assert(CurNode);
1601 
1602       // Remove the context ids moved to NewNode from CurNode, and the
1603       // edge from the prior node.
1604       if (PrevNode) {
1605         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1606         assert(PrevEdge);
1607         set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1608         if (PrevEdge->getContextIds().empty())
1609           removeEdgeFromGraph(PrevEdge);
1610       }
1611       // Since we update the edges from leaf to root, only look at the callee
1612       // edges. This isn't an alloc node, so if there are no callee edges, the
1613       // alloc type is None.
1614       CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1615                                 ? (uint8_t)AllocationType::None
1616                                 : CurNode->computeAllocType();
1617       PrevNode = CurNode;
1618     }
1619     if (VerifyNodes) {
1620       checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1621       for (auto Id : Ids) {
1622         ContextNode *CurNode = getNodeForStackId(Id);
1623         // We should only have kept stack ids that had nodes.
1624         assert(CurNode);
1625         checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1626       }
1627     }
1628   }
1629 }
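// Roughly: a call whose callsite metadata holds stack ids [I0, ..., In]
// (innermost to outermost, due to inlining) ends up with a single new context
// node wired to the callees of the node for I0 and to the callers of the node
// for In, and the matched context ids are removed from the original per-frame
// nodes and the edges between them.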
1630 
1631 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1632 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1633   // Map of stack id to all calls with that as the last (outermost caller)
1634   // callsite id that has a context node (some might not due to pruning
1635   // performed during matching of the allocation profile contexts).
1636   // The CallContextInfo contains the Call and a list of its stack ids with
1637   // ContextNodes, the function containing Call, and the set of context ids
1638   // the analysis will eventually identify for use in any new node created
1639   // for that callsite.
1640   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1641   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1642     for (auto &Call : CallsWithMetadata) {
1643       // Ignore allocations, already handled.
1644       if (AllocationCallToContextNodeMap.count(Call))
1645         continue;
1646       auto StackIdsWithContextNodes =
1647           getStackIdsWithContextNodesForCall(Call.call());
1648       // If there were no nodes created for MIBs on allocs (maybe this was in
1649       // the unambiguous part of the MIB stack that was pruned), ignore.
1650       if (StackIdsWithContextNodes.empty())
1651         continue;
1652       // Otherwise, record this Call along with the list of ids for the last
1653       // (outermost caller) stack id with a node.
1654       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1655           {Call.call(), StackIdsWithContextNodes, Func, {}});
1656     }
1657   }
1658 
1659   // First make a pass through all stack ids that correspond to a call,
1660   // as identified in the above loop. Compute the context ids corresponding to
1661   // each of these calls when they correspond to multiple stack ids due to
1662   // inlining. Perform any duplication of context ids required when
1663   // there is more than one call with the same stack ids. Their (possibly newly
1664   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1665   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1666   // Save a map from each call to any that are found to match it, i.e. those
1667   // located in the same function with the same (possibly pruned) stack ids. We
1668   // use this to avoid creating extra graph nodes as they can be treated the same.
1669   DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1670   for (auto &It : StackIdToMatchingCalls) {
1671     auto &Calls = It.getSecond();
1672     // Skip single calls with a single stack id. These don't need a new node.
1673     if (Calls.size() == 1) {
1674       auto &Ids = Calls[0].StackIds;
1675       if (Ids.size() == 1)
1676         continue;
1677     }
1678     // In order to do the best and maximal matching of inlined calls to context
1679     // node sequences we will sort the vectors of stack ids in descending order
1680     // of length, and within each length, lexicographically by stack id. The
1681     // latter is so that we can specially handle calls that have identical stack
1682     // id sequences (either due to cloning or artificially because of the MIB
1683     // context pruning). Those with the same Ids are then sorted by function to
1684     // facilitate efficiently mapping them to the same context node.
1685     // Because the functions are pointers, to ensure a stable sort first assign
1686     // each function pointer to its first index in the Calls array, and then use
1687     // that to sort by.
1688     DenseMap<const FuncTy *, unsigned> FuncToIndex;
1689     for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1690       FuncToIndex.insert({CallCtxInfo.Func, Idx});
1691     std::stable_sort(
1692         Calls.begin(), Calls.end(),
1693         [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1694           return A.StackIds.size() > B.StackIds.size() ||
1695                  (A.StackIds.size() == B.StackIds.size() &&
1696                   (A.StackIds < B.StackIds ||
1697                    (A.StackIds == B.StackIds &&
1698                     FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1699         });
1700 
1701     // Find the node for the last stack id, which should be the same
1702     // across all calls recorded for this id, and is the id for this
1703     // entry in the StackIdToMatchingCalls map.
1704     uint64_t LastId = It.getFirst();
1705     ContextNode *LastNode = getNodeForStackId(LastId);
1706     // We should only have kept stack ids that had nodes.
1707     assert(LastNode);
1708 
1709     if (LastNode->Recursive)
1710       continue;
1711 
1712     // Initialize the context ids with the last node's. We will subsequently
1713     // refine the context ids by computing the intersection along all edges.
1714     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1715     assert(!LastNodeContextIds.empty());
1716 
1717 #ifndef NDEBUG
1718     // Save the set of functions seen for a particular set of the same stack
1719     // ids. This is used to ensure that they have been correctly sorted to be
1720     // adjacent in the Calls list, since we rely on that to efficiently place
1721     // all such matching calls onto the same context node.
1722     DenseSet<const FuncTy *> MatchingIdsFuncSet;
1723 #endif
1724 
1725     for (unsigned I = 0; I < Calls.size(); I++) {
1726       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1727       assert(SavedContextIds.empty());
1728       assert(LastId == Ids.back());
1729 
1730 #ifndef NDEBUG
1731       // If this call has a different set of ids than the last one, clear the
1732       // set used to ensure they are sorted properly.
1733       if (I > 0 && Ids != Calls[I - 1].StackIds)
1734         MatchingIdsFuncSet.clear();
1735 #endif
1736 
1737       // First compute the context ids for this stack id sequence (the
1738       // intersection of the context ids of the corresponding nodes).
1739       // Start with the remaining saved ids for the last node.
1740       assert(!LastNodeContextIds.empty());
1741       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1742 
1743       ContextNode *PrevNode = LastNode;
1744       ContextNode *CurNode = LastNode;
1745       bool Skip = false;
1746 
1747       // Iterate backwards through the stack Ids, starting after the last Id
1748       // in the list, which was handled once outside for all Calls.
1749       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1750         auto Id = *IdIter;
1751         CurNode = getNodeForStackId(Id);
1752         // We should only have kept stack ids that had nodes.
1753         assert(CurNode);
1754 
1755         if (CurNode->Recursive) {
1756           Skip = true;
1757           break;
1758         }
1759 
1760         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1761         // If there is no edge then the nodes belong to different MIB contexts,
1762         // and we should skip this inlined context sequence. For example, this
1763         // particular inlined context may include stack ids A->B, and we may
1764         // indeed have nodes for both A and B, but it is possible that they were
1765         // never profiled in sequence in a single MIB for any allocation (i.e.
1766         // we might have profiled an allocation that involves the callsite A,
1767         // but through a different one of its callee callsites, and we might
1768         // have profiled an allocation that involves callsite B, but reached
1769         // from a different caller callsite).
1770         if (!Edge) {
1771           Skip = true;
1772           break;
1773         }
1774         PrevNode = CurNode;
1775 
1776         // Update the context ids, which is the intersection of the ids along
1777         // all edges in the sequence.
1778         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1779 
1780         // If we now have no context ids for clone, skip this call.
1781         if (StackSequenceContextIds.empty()) {
1782           Skip = true;
1783           break;
1784         }
1785       }
1786       if (Skip)
1787         continue;
1788 
1789       // If some of this call's stack ids did not have corresponding nodes (due
1790       // to pruning), don't include any context ids for contexts that extend
1791       // beyond these nodes. Otherwise we would be matching part of unrelated /
1792       // not fully matching stack contexts. To do this, subtract any context ids
1793       // found in caller nodes of the last node found above.
1794       if (Ids.back() != getLastStackId(Call)) {
1795         for (const auto &PE : LastNode->CallerEdges) {
1796           set_subtract(StackSequenceContextIds, PE->getContextIds());
1797           if (StackSequenceContextIds.empty())
1798             break;
1799         }
1800         // If we now have no context ids for clone, skip this call.
1801         if (StackSequenceContextIds.empty())
1802           continue;
1803       }
1804 
1805 #ifndef NDEBUG
1806       // If the prior call had the same stack ids this set would not be empty.
1807       // Check if we already have a call that "matches" because it is located
1808       // in the same function. If the Calls list was sorted properly we should
1809       // not encounter this situation as all such entries should be adjacent
1810       // and processed in bulk further below.
1811       assert(!MatchingIdsFuncSet.contains(Func));
1812 
1813       MatchingIdsFuncSet.insert(Func);
1814 #endif
1815 
1816       // Check if the next set of stack ids is the same (since the Calls vector
1817       // of tuples is sorted by the stack ids we can just look at the next one).
1818       // If so, save them in the CallToMatchingCall map so that they get
1819       // assigned to the same context node, and skip them.
1820       bool DuplicateContextIds = false;
1821       for (unsigned J = I + 1; J < Calls.size(); J++) {
1822         auto &CallCtxInfo = Calls[J];
1823         auto &NextIds = CallCtxInfo.StackIds;
1824         if (NextIds != Ids)
1825           break;
1826         auto *NextFunc = CallCtxInfo.Func;
1827         if (NextFunc != Func) {
1828           // We have another Call with the same ids but that cannot share this
1829           // node, must duplicate ids for it.
1830           DuplicateContextIds = true;
1831           break;
1832         }
1833         auto &NextCall = CallCtxInfo.Call;
1834         CallToMatchingCall[NextCall] = Call;
1835         // Update I so that it gets incremented correctly to skip this call.
1836         I = J;
1837       }
1838 
1839       // If we don't have duplicate context ids, then we can assign all the
1840       // context ids computed for the original node sequence to this call.
1841       // If there are duplicate calls with the same stack ids then we synthesize
1842       // new context ids that are duplicates of the originals. These are
1843       // assigned to SavedContextIds, which is a reference into the map entry
1844       // for this call, allowing us to access these ids later on.
1845       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1846                                  StackSequenceContextIds.size());
1847       SavedContextIds =
1848           DuplicateContextIds
1849               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1850               : StackSequenceContextIds;
1851       assert(!SavedContextIds.empty());
1852 
1853       if (!DuplicateContextIds) {
1854         // Update saved last node's context ids to remove those that are
1855         // assigned to other calls, so that it is ready for the next call at
1856         // this stack id.
1857         set_subtract(LastNodeContextIds, StackSequenceContextIds);
1858         if (LastNodeContextIds.empty())
1859           break;
1860       }
1861     }
1862   }
1863 
1864   // Propagate the duplicate context ids over the graph.
1865   propagateDuplicateContextIds(OldToNewContextIds);
1866 
1867   if (VerifyCCG)
1868     check();
1869 
1870   // Now perform a post-order traversal over the graph, starting with the
1871   // allocation nodes, essentially processing nodes from callers to callees.
1872   // For any that contains an id in the map, update the graph to contain new
1873   // nodes representing any inlining at interior callsites. Note we move the
1874   // associated context ids over to the new nodes.
1875   DenseSet<const ContextNode *> Visited;
1876   for (auto &Entry : AllocationCallToContextNodeMap)
1877     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
1878                               CallToMatchingCall);
1879   if (VerifyCCG)
1880     check();
1881 }
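// Note: when calls in different functions share the same (possibly pruned)
// stack id sequence, they cannot share a context node, so their context ids
// are duplicated above and the new ids are then propagated up the graph via
// propagateDuplicateContextIds before the post-order node assignment.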
1882 
1883 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1884   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1885       Call->getMetadata(LLVMContext::MD_callsite));
1886   return CallsiteContext.back();
1887 }
1888 
1889 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1890   assert(isa<CallsiteInfo *>(Call));
1891   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1892       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1893   // Need to convert index into stack id.
1894   return Index.getStackIdAtIndex(CallsiteContext.back());
1895 }
1896 
1897 static const std::string MemProfCloneSuffix = ".memprof.";
1898 
1899 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1900   // We use CloneNo == 0 to refer to the original version, which doesn't get
1901   // renamed with a suffix.
1902   if (!CloneNo)
1903     return Base.str();
1904   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1905 }
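// For example, getMemProfFuncName("foo", 2) yields "foo.memprof.2", while
// clone number 0 keeps the original name "foo".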
1906 
1907 static bool isMemProfClone(const Function &F) {
1908   return F.getName().contains(MemProfCloneSuffix);
1909 }
1910 
1911 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1912                                                  const Instruction *Call,
1913                                                  unsigned CloneNo) const {
1914   return (Twine(Call->getFunction()->getName()) + " -> " +
1915           cast<CallBase>(Call)->getCalledFunction()->getName())
1916       .str();
1917 }
1918 
1919 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1920                                                 const IndexCall &Call,
1921                                                 unsigned CloneNo) const {
1922   auto VI = FSToVIMap.find(Func);
1923   assert(VI != FSToVIMap.end());
1924   if (isa<AllocInfo *>(Call))
1925     return (VI->second.name() + " -> alloc").str();
1926   else {
1927     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
1928     return (VI->second.name() + " -> " +
1929             getMemProfFuncName(Callsite->Callee.name(),
1930                                Callsite->Clones[CloneNo]))
1931         .str();
1932   }
1933 }
1934 
1935 std::vector<uint64_t>
1936 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1937     Instruction *Call) {
1938   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1939       Call->getMetadata(LLVMContext::MD_callsite));
1940   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1941       CallsiteContext);
1942 }
1943 
1944 std::vector<uint64_t>
1945 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1946   assert(isa<CallsiteInfo *>(Call));
1947   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1948       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
1949   return getStackIdsWithContextNodes<CallsiteInfo,
1950                                      SmallVector<unsigned>::const_iterator>(
1951       CallsiteContext);
1952 }
1953 
1954 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1955 template <class NodeT, class IteratorT>
1956 std::vector<uint64_t>
1957 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1958     CallStack<NodeT, IteratorT> &CallsiteContext) {
1959   std::vector<uint64_t> StackIds;
1960   for (auto IdOrIndex : CallsiteContext) {
1961     auto StackId = getStackId(IdOrIndex);
1962     ContextNode *Node = getNodeForStackId(StackId);
1963     if (!Node)
1964       break;
1965     StackIds.push_back(StackId);
1966   }
1967   return StackIds;
1968 }
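// Note: the walk stops at the first stack id without a context node, so the
// returned list is the longest prefix of the callsite context (iterated
// innermost frame first) that is covered by nodes created from the profiled
// allocation contexts.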
1969 
1970 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1971     Module &M,
1972     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1973     : Mod(M), OREGetter(OREGetter) {
1974   for (auto &F : M) {
1975     std::vector<CallInfo> CallsWithMetadata;
1976     for (auto &BB : F) {
1977       for (auto &I : BB) {
1978         if (!isa<CallBase>(I))
1979           continue;
1980         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1981           CallsWithMetadata.push_back(&I);
1982           auto *AllocNode = addAllocNode(&I, &F);
1983           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1984           assert(CallsiteMD);
1985           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1986           // Add all of the MIBs and their stack nodes.
1987           for (auto &MDOp : MemProfMD->operands()) {
1988             auto *MIBMD = cast<const MDNode>(MDOp);
1989             std::vector<ContextTotalSize> ContextSizeInfo;
1990             // Collect the context size information if it exists.
1991             if (MIBMD->getNumOperands() > 2) {
1992               for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
1993                 MDNode *ContextSizePair =
1994                     dyn_cast<MDNode>(MIBMD->getOperand(I));
1995                 assert(ContextSizePair->getNumOperands() == 2);
1996                 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
1997                                            ContextSizePair->getOperand(0))
1998                                            ->getZExtValue();
1999                 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2000                                          ContextSizePair->getOperand(1))
2001                                          ->getZExtValue();
2002                 ContextSizeInfo.push_back({FullStackId, TotalSize});
2003               }
2004             }
2005             MDNode *StackNode = getMIBStackNode(MIBMD);
2006             assert(StackNode);
2007             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2008             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2009                 AllocNode, StackContext, CallsiteContext,
2010                 getMIBAllocType(MIBMD), ContextSizeInfo);
2011           }
2012           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2013           // Memprof and callsite metadata on memory allocations no longer
2014           // needed.
2015           I.setMetadata(LLVMContext::MD_memprof, nullptr);
2016           I.setMetadata(LLVMContext::MD_callsite, nullptr);
2017         }
2018         // For callsite metadata, add to list for this function for later use.
2019         else if (I.getMetadata(LLVMContext::MD_callsite)) {
2020           CallsWithMetadata.push_back(&I);
2021         }
2022       }
2023     }
2024     if (!CallsWithMetadata.empty())
2025       FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2026   }
2027 
2028   if (DumpCCG) {
2029     dbgs() << "CCG before updating call stack chains:\n";
2030     dbgs() << *this;
2031   }
2032 
2033   if (ExportToDot)
2034     exportToDot("prestackupdate");
2035 
2036   updateStackNodes();
2037 
2038   handleCallsitesWithMultipleTargets();
2039 
2040   // Strip off remaining callsite metadata, no longer needed.
2041   for (auto &FuncEntry : FuncToCallsWithMetadata)
2042     for (auto &Call : FuncEntry.second)
2043       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2044 }
2045 
2046 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2047     ModuleSummaryIndex &Index,
2048     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2049         isPrevailing)
2050     : Index(Index), isPrevailing(isPrevailing) {
2051   for (auto &I : Index) {
2052     auto VI = Index.getValueInfo(I);
2053     for (auto &S : VI.getSummaryList()) {
2054       // We should only add the prevailing nodes. Otherwise we may try to clone
2055       // in a weak copy that won't be linked (and may be different from the
2056       // prevailing version).
2057       // We only keep the memprof summary on the prevailing copy now when
2058       // building the combined index, as a space optimization; however, don't
2059       // rely on this optimization. The linker doesn't resolve local linkage
2060       // values so don't check whether those are prevailing.
2061       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2062           !isPrevailing(VI.getGUID(), S.get()))
2063         continue;
2064       auto *FS = dyn_cast<FunctionSummary>(S.get());
2065       if (!FS)
2066         continue;
2067       std::vector<CallInfo> CallsWithMetadata;
2068       if (!FS->allocs().empty()) {
2069         for (auto &AN : FS->mutableAllocs()) {
2070           // This can happen because of recursion elimination handling that
2071           // currently exists in ModuleSummaryAnalysis. Skip these for now.
2072           // We still added them to the summary because we need to be able to
2073           // correlate properly in applyImport in the backends.
2074           if (AN.MIBs.empty())
2075             continue;
2076           IndexCall AllocCall(&AN);
2077           CallsWithMetadata.push_back(AllocCall);
2078           auto *AllocNode = addAllocNode(AllocCall, FS);
2079           // Pass an empty CallStack to the CallsiteContext (second)
2080           // parameter, since for ThinLTO we already collapsed out the inlined
2081           // stack ids on the allocation call during ModuleSummaryAnalysis.
2082           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2083               EmptyContext;
2084           unsigned I = 0;
2085           assert(
2086               (!MemProfReportHintedSizes && MinClonedColdBytePercent >= 100) ||
2087               AN.ContextSizeInfos.size() == AN.MIBs.size());
2088           // Now add all of the MIBs and their stack nodes.
2089           for (auto &MIB : AN.MIBs) {
2090             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2091                 StackContext(&MIB);
2092             std::vector<ContextTotalSize> ContextSizeInfo;
2093             if (!AN.ContextSizeInfos.empty()) {
2094               for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2095                 ContextSizeInfo.push_back({FullStackId, TotalSize});
2096             }
2097             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2098                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2099                 ContextSizeInfo);
2100             I++;
2101           }
2102           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2103           // Initialize version 0 on the summary alloc node to the current alloc
2104           // type, unless it has both types in which case make it default, so
2105           // that if we aren't able to clone, the original version always ends
2106           // up with the default allocation behavior.
2107           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2108         }
2109       }
2110       // For callsite metadata, add to list for this function for later use.
2111       if (!FS->callsites().empty())
2112         for (auto &SN : FS->mutableCallsites()) {
2113           IndexCall StackNodeCall(&SN);
2114           CallsWithMetadata.push_back(StackNodeCall);
2115         }
2116 
2117       if (!CallsWithMetadata.empty())
2118         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2119 
2120       if (!FS->allocs().empty() || !FS->callsites().empty())
2121         FSToVIMap[FS] = VI;
2122     }
2123   }
2124 
2125   if (DumpCCG) {
2126     dbgs() << "CCG before updating call stack chains:\n";
2127     dbgs() << *this;
2128   }
2129 
2130   if (ExportToDot)
2131     exportToDot("prestackupdate");
2132 
2133   updateStackNodes();
2134 
2135   handleCallsitesWithMultipleTargets();
2136 }
2137 
2138 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2139 void CallsiteContextGraph<DerivedCCG, FuncTy,
2140                           CallTy>::handleCallsitesWithMultipleTargets() {
2141   // Look for and work around callsites that call multiple functions.
2142   // This can happen for indirect calls, which need better handling, and in
2143   // rarer cases (e.g. macro expansion).
2144   // TODO: To fix this for indirect calls we will want to perform speculative
2145   // devirtualization using either the normal PGO info with ICP, or using the
2146   // information in the profiled MemProf contexts. We can do this prior to
2147   // this transformation for regular LTO, and for ThinLTO we can simulate that
2148   // effect in the summary and perform the actual speculative devirtualization
2149   // while cloning in the ThinLTO backend.
2150 
2151   // Keep track of the new nodes synthesized for discovered tail calls missing
2152   // from the profiled contexts.
2153   MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2154 
2155   std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2156   for (auto &Entry : NonAllocationCallToContextNodeMap) {
2157     auto *Node = Entry.second;
2158     assert(Node->Clones.empty());
2159     // Check all node callees and see if in the same function.
2160     // We need to check all of the calls recorded in this Node, because in some
2161     // cases we may have had multiple calls with the same debug info calling
2162     // different callees. This can happen, for example, when an object is
2163     // constructed in the parameter list - the destructor call of the object has
2164     // the same debug info (line/col) as the call the object was passed to.
2165     // Here we will prune any that don't match all callee nodes.
2166     std::vector<CallInfo> AllCalls;
2167     AllCalls.reserve(Node->MatchingCalls.size() + 1);
2168     AllCalls.push_back(Node->Call);
2169     AllCalls.insert(AllCalls.end(), Node->MatchingCalls.begin(),
2170                     Node->MatchingCalls.end());
2171 
2172     // First see if we can partition the calls by callee function, creating new
2173     // nodes to host each set of calls calling the same callees. This is
2174     // necessary to support indirect calls with ThinLTO, for which we
2175     // synthesized CallsiteInfo records for each target. They will all have the
2176     // same callsite stack ids and would be sharing a context node at this
2177     // point. We need to perform separate cloning for each, which will be
2178     // applied along with speculative devirtualization in the ThinLTO backends
2179     // as needed. Note this does not currently support looking through tail
2180     // calls; it is unclear if we need that for indirect call targets.
2181     // First partition calls by callee func. Map indexed by func, value is
2182     // struct with list of matching calls, assigned node.
2183     if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2184       continue;
2185 
2186     auto It = AllCalls.begin();
2187     // Iterate through the calls until we find the first that matches.
2188     for (; It != AllCalls.end(); ++It) {
2189       auto ThisCall = *It;
2190       bool Match = true;
2191       for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2192            ++EI) {
2193         auto Edge = *EI;
2194         if (!Edge->Callee->hasCall())
2195           continue;
2196         assert(NodeToCallingFunc.count(Edge->Callee));
2197         // Check if the called function matches that of the callee node.
2198         if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2199           Match = false;
2200           break;
2201         }
2202       }
2203       // Found a call that matches the callee nodes, we can quit now.
2204       if (Match) {
2205         // If the first match is not the primary call on the Node, update it
2206         // now. We will update the list of matching calls further below.
2207         if (Node->Call != ThisCall) {
2208           Node->setCall(ThisCall);
2209           // We need to update the NonAllocationCallToContextNodeMap, but don't
2210           // want to do this during iteration over that map, so save the calls
2211           // that need updated entries.
2212           NewCallToNode.push_back({ThisCall, Node});
2213         }
2214         break;
2215       }
2216     }
2217     // We will update this list below (or leave it cleared if there was no
2218     // match found above).
2219     Node->MatchingCalls.clear();
2220     // If we hit the end of the AllCalls vector, no call matching the callee
2221     // nodes was found, clear the call information in the node.
2222     if (It == AllCalls.end()) {
2223       RemovedEdgesWithMismatchedCallees++;
2224       // Work around by setting Node to have a null call, so it gets
2225       // skipped during cloning. Otherwise assignFunctions will assert
2226       // because its data structures are not designed to handle this case.
2227       Node->setCall(CallInfo());
2228       continue;
2229     }
2230     // Now add back any matching calls that call the same function as the
2231     // matching primary call on Node.
2232     for (++It; It != AllCalls.end(); ++It) {
2233       auto ThisCall = *It;
2234       if (!sameCallee(Node->Call.call(), ThisCall.call()))
2235         continue;
2236       Node->MatchingCalls.push_back(ThisCall);
2237     }
2238   }
2239 
2240   // Remove all mismatched nodes identified in the above loop from the node map
2241   // (checking whether they have a null call which is set above). For a
2242   // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2243   // to do the removal via remove_if than by individually erasing entries above.
2244   // Also remove any entries if we updated the node's primary call above.
2245   NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2246     return !it.second->hasCall() || it.second->Call != it.first;
2247   });
2248 
2249   // Add entries for any new primary calls recorded above.
2250   for (auto &[Call, Node] : NewCallToNode)
2251     NonAllocationCallToContextNodeMap[Call] = Node;
2252 
2253   // Add the new nodes after the above loop so that the iteration is not
2254   // invalidated.
2255   for (auto &[Call, Node] : TailCallToContextNodeMap)
2256     NonAllocationCallToContextNodeMap[Call] = Node;
2257 }
2258 
2259 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2260 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2261     ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2262     std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2263   // Struct to keep track of all the calls having the same callee function,
2264   // and the context node that we will eventually assign to this group of
2265   // calls.
2266   struct CallsWithSameCallee {
2267     std::vector<CallInfo> Calls;
2268     ContextNode *Node = nullptr;
2269   };
2270 
2271   // First partition calls by callee function. Build map from each function
2272   // to the list of matching calls.
2273   DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2274   for (auto ThisCall : AllCalls) {
2275     auto *F = getCalleeFunc(ThisCall.call());
2276     if (F)
2277       CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2278   }
2279 
2280   // Next, walk through all callee edges. For each callee node, get its
2281   // containing function and see if it was recorded in the above map (meaning we
2282   // have at least one matching call). Build another map from each callee node
2283   // with a matching call to the structure instance created above containing all
2284   // the calls.
2285   DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2286   for (const auto &Edge : Node->CalleeEdges) {
2287     if (!Edge->Callee->hasCall())
2288       continue;
2289     const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2290     if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2291       CalleeNodeToCallInfo[Edge->Callee] =
2292           &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2293   }
2294 
2295   // If there are no entries in the second map, then there were no matching
2296   // calls/callees; nothing to do here. Return so we can go to the handling that
2297   // looks through tail calls.
2298   if (CalleeNodeToCallInfo.empty())
2299     return false;
2300 
2301   // Walk through all callee edges again. Any and all callee edges that didn't
2302   // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2303   // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2304   // ignored during cloning. If it is in the map, then we use the node recorded
2305   // in that entry (creating it if needed), and move the callee edge to it.
2306   // The first callee will use the original node instead of creating a new one.
2307   // Note that any of the original calls on this node (in AllCalls) that didn't
2308   // have a callee function automatically get dropped from the node as part of
2309   // this process.
2310   ContextNode *UnmatchedCalleesNode = nullptr;
2311   // Track whether we already assigned original node to a callee.
2312   bool UsedOrigNode = false;
2313   assert(NodeToCallingFunc[Node]);
2314   // Iterate over a copy of Node's callee edges, since we may need to remove
2315   // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2316   // makes it less error-prone.
2317   auto CalleeEdges = Node->CalleeEdges;
2318   for (auto &Edge : CalleeEdges) {
2319     if (!Edge->Callee->hasCall())
2320       continue;
2321 
2322     // Will be updated below to point to whatever (caller) node this callee edge
2323     // should be moved to.
2324     ContextNode *CallerNodeToUse = nullptr;
2325 
2326     // Handle the case where there were no matching calls first. Move this
2327     // callee edge to the UnmatchedCalleesNode, creating it if needed.
2328     if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2329       if (!UnmatchedCalleesNode)
2330         UnmatchedCalleesNode =
2331             createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2332       CallerNodeToUse = UnmatchedCalleesNode;
2333     } else {
2334       // Look up the information recorded for this callee node, and use the
2335       // recorded caller node (creating it if needed).
2336       auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2337       if (!Info->Node) {
2338         // If we haven't assigned any callees to the original node use it.
2339         if (!UsedOrigNode) {
2340           Info->Node = Node;
2341           // Clear the set of matching calls which will be updated below.
2342           Node->MatchingCalls.clear();
2343           UsedOrigNode = true;
2344         } else
2345           Info->Node =
2346               createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2347         assert(!Info->Calls.empty());
2348         // The first call becomes the primary call for this caller node, and the
2349         // rest go in the matching calls list.
2350         Info->Node->setCall(Info->Calls.front());
2351         Info->Node->MatchingCalls.insert(Info->Node->MatchingCalls.end(),
2352                                          Info->Calls.begin() + 1,
2353                                          Info->Calls.end());
2354         // Save the primary call to node correspondence so that we can update
2355         // the NonAllocationCallToContextNodeMap, which is being iterated in the
2356         // caller of this function.
2357         NewCallToNode.push_back({Info->Node->Call, Info->Node});
2358       }
2359       CallerNodeToUse = Info->Node;
2360     }
2361 
2362     // Don't need to move the edge if we are using the original node.
2363     if (CallerNodeToUse == Node)
2364       continue;
2365 
2366     moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2367   }
2368   // Now that we are done moving edges, clean up any caller edges that ended
2369   // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2370   // caller edges from Node are replicated onto the new callers, and it
2371   // simplifies the handling to leave them until we have moved all
2372   // edges/context ids.
2373   for (auto &I : CalleeNodeToCallInfo)
2374     removeNoneTypeCallerEdges(I.second->Node);
2375   if (UnmatchedCalleesNode)
2376     removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2377   removeNoneTypeCallerEdges(Node);
2378 
2379   return true;
2380 }
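// Illustrative example: if Node's recorded calls resolve to callee functions F
// and G, the first group encountered (say, the calls targeting F) keeps the
// original node while the group targeting G gets a new node; callee edges whose
// profiled callee matches none of the calls are moved onto UnmatchedCalleesNode,
// which is left without a call so it is ignored during cloning.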
2381 
2382 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2383   // In the Module (IR) case this is already the Id.
2384   return IdOrIndex;
2385 }
2386 
2387 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2388   // In the Index case this is an index into the stack id list in the summary
2389   // index, convert it to an Id.
2390   return Index.getStackIdAtIndex(IdOrIndex);
2391 }
2392 
2393 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2394 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2395     CallTy Call, EdgeIter &EI,
2396     MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2397   auto Edge = *EI;
2398   const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2399   const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2400   // Will be populated in order of callee to caller if we find a chain of tail
2401   // calls between the profiled caller and callee.
2402   std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2403   if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2404                          FoundCalleeChain))
2405     return false;
2406 
2407   // The usual case where the profiled callee matches that of the IR/summary.
2408   if (FoundCalleeChain.empty())
2409     return true;
2410 
2411   auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2412     auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2413     // If there is already an edge between these nodes, simply update it and
2414     // return.
2415     if (CurEdge) {
2416       CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
2417                                  Edge->ContextIds.end());
2418       CurEdge->AllocTypes |= Edge->AllocTypes;
2419       return;
2420     }
2421     // Otherwise, create a new edge and insert it into the caller and callee
2422     // lists.
2423     auto NewEdge = std::make_shared<ContextEdge>(
2424         Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2425     Callee->CallerEdges.push_back(NewEdge);
2426     if (Caller == Edge->Caller) {
2427       // If we are inserting the new edge into the current edge's caller, insert
2428       // the new edge before the current iterator position, and then increment
2429       // back to the current edge.
2430       EI = Caller->CalleeEdges.insert(EI, NewEdge);
2431       ++EI;
2432       assert(*EI == Edge &&
2433              "Iterator position not restored after insert and increment");
2434     } else
2435       Caller->CalleeEdges.push_back(NewEdge);
2436   };
2437 
2438   // Create new nodes for each found callee and connect in between the profiled
2439   // caller and callee.
2440   auto *CurCalleeNode = Edge->Callee;
2441   for (auto &[NewCall, Func] : FoundCalleeChain) {
2442     ContextNode *NewNode = nullptr;
2443     // First check if we have already synthesized a node for this tail call.
2444     if (TailCallToContextNodeMap.count(NewCall)) {
2445       NewNode = TailCallToContextNodeMap[NewCall];
2446       NewNode->AllocTypes |= Edge->AllocTypes;
2447     } else {
2448       FuncToCallsWithMetadata[Func].push_back({NewCall});
2449       // Create Node and record node info.
2450       NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2451       TailCallToContextNodeMap[NewCall] = NewNode;
2452       NewNode->AllocTypes = Edge->AllocTypes;
2453     }
2454 
2455     // Hook up the new node to its callee node.
2456     AddEdge(NewNode, CurCalleeNode);
2457 
2458     CurCalleeNode = NewNode;
2459   }
2460 
2461   // Hook up edge's original caller to new callee node.
2462   AddEdge(Edge->Caller, CurCalleeNode);
2463 
2464 #ifndef NDEBUG
2465   // Save this because Edge's fields get cleared below when removed.
2466   auto *Caller = Edge->Caller;
2467 #endif
2468 
2469   // Remove the old edge.
2470   removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2471 
2472   // To simplify the increment of EI in the caller, subtract one from EI.
2473   // In the final AddEdge call we would have either added a new callee edge
2474   // to Edge->Caller, or found an existing one. Either way we are guaranteed
2475   // that there is at least one callee edge.
2476   assert(!Caller->CalleeEdges.empty());
2477   --EI;
2478 
2479   return true;
2480 }
2481 
2482 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2483     const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2484     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2485     bool &FoundMultipleCalleeChains) {
2486   // Stop recursive search if we have already explored the maximum specified
2487   // depth.
2488   if (Depth > TailCallSearchDepth)
2489     return false;
2490 
2491   auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2492     FoundCalleeChain.push_back({Callsite, F});
2493   };
2494 
2495   auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2496   if (!CalleeFunc) {
2497     auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2498     assert(Alias);
2499     CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2500     assert(CalleeFunc);
2501   }
2502 
2503   // Look for tail calls in this function, and check if they call the profiled
2504   // callee either directly or indirectly (via a recursive search).
2505   // Only succeed if there is a single unique tail call chain found between the
2506   // profiled caller and callee, otherwise we could perform incorrect cloning.
2507   bool FoundSingleCalleeChain = false;
2508   for (auto &BB : *CalleeFunc) {
2509     for (auto &I : BB) {
2510       auto *CB = dyn_cast<CallBase>(&I);
2511       if (!CB || !CB->isTailCall())
2512         continue;
2513       auto *CalledValue = CB->getCalledOperand();
2514       auto *CalledFunction = CB->getCalledFunction();
2515       if (CalledValue && !CalledFunction) {
2516         CalledValue = CalledValue->stripPointerCasts();
2517         // Stripping pointer casts can reveal a called function.
2518         CalledFunction = dyn_cast<Function>(CalledValue);
2519       }
2520       // Check if this is an alias to a function. If so, get the
2521       // called aliasee for the checks below.
2522       if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2523         assert(!CalledFunction &&
2524                "Expected null called function in callsite for alias");
2525         CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2526       }
2527       if (!CalledFunction)
2528         continue;
2529       if (CalledFunction == ProfiledCallee) {
2530         if (FoundSingleCalleeChain) {
2531           FoundMultipleCalleeChains = true;
2532           return false;
2533         }
2534         FoundSingleCalleeChain = true;
2535         FoundProfiledCalleeCount++;
2536         FoundProfiledCalleeDepth += Depth;
2537         if (Depth > FoundProfiledCalleeMaxDepth)
2538           FoundProfiledCalleeMaxDepth = Depth;
2539         SaveCallsiteInfo(&I, CalleeFunc);
2540       } else if (findProfiledCalleeThroughTailCalls(
2541                      ProfiledCallee, CalledFunction, Depth + 1,
2542                      FoundCalleeChain, FoundMultipleCalleeChains)) {
2543         // findProfiledCalleeThroughTailCalls should not have returned
2544         // true if FoundMultipleCalleeChains.
2545         assert(!FoundMultipleCalleeChains);
2546         if (FoundSingleCalleeChain) {
2547           FoundMultipleCalleeChains = true;
2548           return false;
2549         }
2550         FoundSingleCalleeChain = true;
2551         SaveCallsiteInfo(&I, CalleeFunc);
2552       } else if (FoundMultipleCalleeChains)
2553         return false;
2554     }
2555   }
2556 
2557   return FoundSingleCalleeChain;
2558 }
2559 
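// Returns the function statically called by Call, looking through any alias,
// or nullptr if the call is indirect or the callee cannot be determined.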
2560 const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2561   auto *CB = cast<CallBase>(Call);
2562   if (!CB->getCalledOperand() || CB->isIndirectCall())
2563     return nullptr;
2564   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2565   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2566   if (Alias)
2567     return dyn_cast<Function>(Alias->getAliasee());
2568   return dyn_cast<Function>(CalleeVal);
2569 }
2570 
2571 bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2572     Instruction *Call, const Function *Func, const Function *CallerFunc,
2573     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2574   auto *CB = cast<CallBase>(Call);
2575   if (!CB->getCalledOperand() || CB->isIndirectCall())
2576     return false;
2577   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2578   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2579   if (CalleeFunc == Func)
2580     return true;
2581   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2582   if (Alias && Alias->getAliasee() == Func)
2583     return true;
2584 
2585   // Recursively search for the profiled callee through tail calls starting with
2586   // the actual Callee. The discovered tail call chain is saved in
2587   // FoundCalleeChain, and we will fix up the graph to include these callsites
2588   // after returning.
2589   // FIXME: We will currently redo the same recursive walk if we find the same
2590   // mismatched callee from another callsite. We can improve this with more
2591   // bookkeeping of the created chain of new nodes for each mismatch.
2592   unsigned Depth = 1;
2593   bool FoundMultipleCalleeChains = false;
2594   if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2595                                           FoundCalleeChain,
2596                                           FoundMultipleCalleeChains)) {
2597     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2598                       << Func->getName() << " from " << CallerFunc->getName()
2599                       << " that actually called " << CalleeVal->getName()
2600                       << (FoundMultipleCalleeChains
2601                               ? " (found multiple possible chains)"
2602                               : "")
2603                       << "\n");
2604     if (FoundMultipleCalleeChains)
2605       FoundProfiledCalleeNonUniquelyCount++;
2606     return false;
2607   }
2608 
2609   return true;
2610 }
2611 
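// Returns true if the two direct calls resolve to the same Function after
// stripping pointer casts from their called operands; indirect calls never
// compare equal.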
2612 bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2613                                             Instruction *Call2) {
2614   auto *CB1 = cast<CallBase>(Call1);
2615   if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2616     return false;
2617   auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2618   auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2619   auto *CB2 = cast<CallBase>(Call2);
2620   if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2621     return false;
2622   auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2623   auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2624   return CalleeFunc1 == CalleeFunc2;
2625 }
2626 
2627 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2628     ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2629     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2630     bool &FoundMultipleCalleeChains) {
2631   // Stop recursive search if we have already explored the maximum specified
2632   // depth.
2633   if (Depth > TailCallSearchDepth)
2634     return false;
2635 
2636   auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2637     // Make a CallsiteInfo for each discovered callee, if one hasn't already
2638     // been synthesized.
2639     if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2640         !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2641       // StackIds is empty (we don't have debug info available in the index for
2642       // these callsites).
2643       FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2644           std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2645     CallsiteInfo *NewCallsiteInfo =
2646         FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2647     FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2648   };
2649 
2650   // Look for tail calls in this function, and check if they call the profiled
2651   // callee either directly or indirectly (via a recursive search).
2652   // Only succeed if there is a single unique tail call chain found between the
2653   // profiled caller and callee, otherwise we could perform incorrect cloning.
2654   bool FoundSingleCalleeChain = false;
2655   for (auto &S : CurCallee.getSummaryList()) {
2656     if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2657         !isPrevailing(CurCallee.getGUID(), S.get()))
2658       continue;
2659     auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2660     if (!FS)
2661       continue;
2662     auto FSVI = CurCallee;
2663     auto *AS = dyn_cast<AliasSummary>(S.get());
2664     if (AS)
2665       FSVI = AS->getAliaseeVI();
2666     for (auto &CallEdge : FS->calls()) {
2667       if (!CallEdge.second.hasTailCall())
2668         continue;
2669       if (CallEdge.first == ProfiledCallee) {
2670         if (FoundSingleCalleeChain) {
2671           FoundMultipleCalleeChains = true;
2672           return false;
2673         }
2674         FoundSingleCalleeChain = true;
2675         FoundProfiledCalleeCount++;
2676         FoundProfiledCalleeDepth += Depth;
2677         if (Depth > FoundProfiledCalleeMaxDepth)
2678           FoundProfiledCalleeMaxDepth = Depth;
2679         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2680         // Add FS to FSToVIMap in case it isn't already there.
2681         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2682         FSToVIMap[FS] = FSVI;
2683       } else if (findProfiledCalleeThroughTailCalls(
2684                      ProfiledCallee, CallEdge.first, Depth + 1,
2685                      FoundCalleeChain, FoundMultipleCalleeChains)) {
2686         // findProfiledCalleeThroughTailCalls should not have returned
2687         // true if FoundMultipleCalleeChains.
2688         assert(!FoundMultipleCalleeChains);
2689         if (FoundSingleCalleeChain) {
2690           FoundMultipleCalleeChains = true;
2691           return false;
2692         }
2693         FoundSingleCalleeChain = true;
2694         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2695         // Add FS to FSToVIMap in case it isn't already there.
2696         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2697         FSToVIMap[FS] = FSVI;
2698       } else if (FoundMultipleCalleeChains)
2699         return false;
2700     }
2701   }
2702 
2703   return FoundSingleCalleeChain;
2704 }
2705 
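// Returns the summary of the function called by this callsite record, or
// nullptr if the callee has no summary (e.g. it is externally defined).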
2706 const FunctionSummary *
2707 IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2708   ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2709   if (Callee.getSummaryList().empty())
2710     return nullptr;
2711   return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2712 }
2713 
2714 bool IndexCallsiteContextGraph::calleeMatchesFunc(
2715     IndexCall &Call, const FunctionSummary *Func,
2716     const FunctionSummary *CallerFunc,
2717     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2718   ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2719   // If there is no summary list then this is a call to an externally defined
2720   // symbol.
2721   AliasSummary *Alias =
2722       Callee.getSummaryList().empty()
2723           ? nullptr
2724           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2725   assert(FSToVIMap.count(Func));
2726   auto FuncVI = FSToVIMap[Func];
2727   if (Callee == FuncVI ||
2728       // If callee is an alias, check the aliasee, since only function
2729       // summary base objects will contain the stack node summaries and thus
2730       // get a context node.
2731       (Alias && Alias->getAliaseeVI() == FuncVI))
2732     return true;
2733 
2734   // Recursively search for the profiled callee through tail calls starting with
2735   // the actual Callee. The discovered tail call chain is saved in
2736   // FoundCalleeChain, and we will fix up the graph to include these callsites
2737   // after returning.
2738   // FIXME: We will currently redo the same recursive walk if we find the same
2739   // mismatched callee from another callsite. We can improve this with more
2740   // bookkeeping of the created chain of new nodes for each mismatch.
2741   unsigned Depth = 1;
2742   bool FoundMultipleCalleeChains = false;
2743   if (!findProfiledCalleeThroughTailCalls(
2744           FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2745     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2746                       << " from " << FSToVIMap[CallerFunc]
2747                       << " that actually called " << Callee
2748                       << (FoundMultipleCalleeChains
2749                               ? " (found multiple possible chains)"
2750                               : "")
2751                       << "\n");
2752     if (FoundMultipleCalleeChains)
2753       FoundProfiledCalleeNonUniquelyCount++;
2754     return false;
2755   }
2756 
2757   return true;
2758 }
2759 
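// Two callsite records have the same callee if their Callee ValueInfos match.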
2760 bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2761   ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2762   ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2763   return Callee1 == Callee2;
2764 }
2765 
2766 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2767 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2768     const {
2769   print(dbgs());
2770   dbgs() << "\n";
2771 }
2772 
2773 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2774 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2775     raw_ostream &OS) const {
2776   OS << "Node " << this << "\n";
2777   OS << "\t";
2778   printCall(OS);
2779   if (Recursive)
2780     OS << " (recursive)";
2781   OS << "\n";
2782   if (!MatchingCalls.empty()) {
2783     OS << "\tMatchingCalls:\n";
2784     for (auto &MatchingCall : MatchingCalls) {
2785       OS << "\t";
2786       MatchingCall.print(OS);
2787       OS << "\n";
2788     }
2789   }
2790   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2791   OS << "\tContextIds:";
2792   // Make a copy of the computed context ids that we can sort for stability.
2793   auto ContextIds = getContextIds();
2794   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2795   std::sort(SortedIds.begin(), SortedIds.end());
2796   for (auto Id : SortedIds)
2797     OS << " " << Id;
2798   OS << "\n";
2799   OS << "\tCalleeEdges:\n";
2800   for (auto &Edge : CalleeEdges)
2801     OS << "\t\t" << *Edge << "\n";
2802   OS << "\tCallerEdges:\n";
2803   for (auto &Edge : CallerEdges)
2804     OS << "\t\t" << *Edge << "\n";
2805   if (!Clones.empty()) {
2806     OS << "\tClones: ";
2807     ListSeparator LS;
2808     for (auto *Clone : Clones)
2809       OS << LS << Clone;
2810     OS << "\n";
2811   } else if (CloneOf) {
2812     OS << "\tClone of " << CloneOf << "\n";
2813   }
2814 }
2815 
2816 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2817 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2818     const {
2819   print(dbgs());
2820   dbgs() << "\n";
2821 }
2822 
2823 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2824 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2825     raw_ostream &OS) const {
2826   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2827      << " AllocTypes: " << getAllocTypeString(AllocTypes);
2828   OS << " ContextIds:";
2829   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2830   std::sort(SortedIds.begin(), SortedIds.end());
2831   for (auto Id : SortedIds)
2832     OS << " " << Id;
2833 }
2834 
2835 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2836 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2837   print(dbgs());
2838 }
2839 
2840 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2841 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2842     raw_ostream &OS) const {
2843   OS << "Callsite Context Graph:\n";
2844   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2845   for (const auto Node : nodes<GraphType>(this)) {
2846     if (Node->isRemoved())
2847       continue;
2848     Node->print(OS);
2849     OS << "\n";
2850   }
2851 }
2852 
2853 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2854 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2855     raw_ostream &OS) const {
2856   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2857   for (const auto Node : nodes<GraphType>(this)) {
2858     if (Node->isRemoved())
2859       continue;
2860     if (!Node->IsAllocation)
2861       continue;
2862     DenseSet<uint32_t> ContextIds = Node->getContextIds();
2863     auto AllocTypeFromCall = getAllocationCallType(Node->Call);
2864     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2865     std::sort(SortedIds.begin(), SortedIds.end());
2866     for (auto Id : SortedIds) {
2867       auto TypeI = ContextIdToAllocationType.find(Id);
2868       assert(TypeI != ContextIdToAllocationType.end());
2869       auto CSI = ContextIdToContextSizeInfos.find(Id);
2870       if (CSI != ContextIdToContextSizeInfos.end()) {
2871         for (auto &Info : CSI->second) {
2872           OS << "MemProf hinting: "
2873              << getAllocTypeString((uint8_t)TypeI->second)
2874              << " full allocation context " << Info.FullStackId
2875              << " with total size " << Info.TotalSize << " is "
2876              << getAllocTypeString(Node->AllocTypes) << " after cloning";
2877           if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
2878             OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
2879                << " due to cold byte percent";
2880           OS << "\n";
2881         }
2882       }
2883     }
2884   }
2885 }
2886 
2887 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2888 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2889   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2890   for (const auto Node : nodes<GraphType>(this)) {
2891     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2892     for (auto &Edge : Node->CallerEdges)
2893       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2894   }
2895 }
2896 
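// GraphTraits specialization so that generic graph algorithms and the DOT
// printing machinery (WriteGraph) can traverse the context graph. The nodes
// are the context nodes owned by NodeOwner, and a node's children are the
// callees reachable via its CalleeEdges.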
2897 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2898 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2899   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2900   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2901 
2902   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2903   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2904 
2905   using nodes_iterator =
2906       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2907                       decltype(&getNode)>;
2908 
2909   static nodes_iterator nodes_begin(GraphType G) {
2910     return nodes_iterator(G->NodeOwner.begin(), &getNode);
2911   }
2912 
2913   static nodes_iterator nodes_end(GraphType G) {
2914     return nodes_iterator(G->NodeOwner.end(), &getNode);
2915   }
2916 
2917   static NodeRef getEntryNode(GraphType G) {
2918     return G->NodeOwner.begin()->get();
2919   }
2920 
2921   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2922   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2923   GetCallee(const EdgePtrTy &P) {
2924     return P->Callee;
2925   }
2926 
2927   using ChildIteratorType =
2928       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2929                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2930                       decltype(&GetCallee)>;
2931 
2932   static ChildIteratorType child_begin(NodeRef N) {
2933     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2934   }
2935 
2936   static ChildIteratorType child_end(NodeRef N) {
2937     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2938   }
2939 };
2940 
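// DOT printing support: labels nodes with their original stack/allocation id
// and call, colors them by allocation type, and hides nodes that have been
// removed from the graph.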
2941 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2942 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2943     : public DefaultDOTGraphTraits {
2944   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2945 
2946   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2947   using GTraits = GraphTraits<GraphType>;
2948   using NodeRef = typename GTraits::NodeRef;
2949   using ChildIteratorType = typename GTraits::ChildIteratorType;
2950 
2951   static std::string getNodeLabel(NodeRef Node, GraphType G) {
2952     std::string LabelString =
2953         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2954          Twine(Node->OrigStackOrAllocId))
2955             .str();
2956     LabelString += "\n";
2957     if (Node->hasCall()) {
2958       auto Func = G->NodeToCallingFunc.find(Node);
2959       assert(Func != G->NodeToCallingFunc.end());
2960       LabelString +=
2961           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2962     } else {
2963       LabelString += "null call";
2964       if (Node->Recursive)
2965         LabelString += " (recursive)";
2966       else
2967         LabelString += " (external)";
2968     }
2969     return LabelString;
2970   }
2971 
2972   static std::string getNodeAttributes(NodeRef Node, GraphType) {
2973     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2974                                    getContextIds(Node->getContextIds()) + "\"")
2975                                       .str();
2976     AttributeString +=
2977         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2978     AttributeString += ",style=\"filled\"";
2979     if (Node->CloneOf) {
2980       AttributeString += ",color=\"blue\"";
2981       AttributeString += ",style=\"filled,bold,dashed\"";
2982     } else
2983       AttributeString += ",style=\"filled\"";
2984     return AttributeString;
2985   }
2986 
2987   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2988                                        GraphType) {
2989     auto &Edge = *(ChildIter.getCurrent());
2990     return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2991             Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2992         .str();
2993   }
2994 
2995   // Since the NodeOwners list includes nodes that are no longer connected to
2996   // Since the NodeOwner list includes nodes that are no longer connected to
2997   static bool isNodeHidden(NodeRef Node, GraphType) {
2998     return Node->isRemoved();
2999   }
3000 
3001 private:
3002   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3003     std::string IdString = "ContextIds:";
3004     if (ContextIds.size() < 100) {
3005       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3006       std::sort(SortedIds.begin(), SortedIds.end());
3007       for (auto Id : SortedIds)
3008         IdString += (" " + Twine(Id)).str();
3009     } else {
3010       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3011     }
3012     return IdString;
3013   }
3014 
3015   static std::string getColor(uint8_t AllocTypes) {
3016     if (AllocTypes == (uint8_t)AllocationType::NotCold)
3017       // Color "brown1" actually looks like a lighter red.
3018       return "brown1";
3019     if (AllocTypes == (uint8_t)AllocationType::Cold)
3020       return "cyan";
3021     if (AllocTypes ==
3022         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3023       // Lighter purple.
3024       return "mediumorchid1";
3025     return "gray";
3026   }
3027 
3028   static std::string getNodeId(NodeRef Node) {
3029     std::stringstream SStream;
3030     SStream << std::hex << "N0x" << (unsigned long long)Node;
3031     std::string Result = SStream.str();
3032     return Result;
3033   }
3034 };
3035 
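// Writes the graph to a .dot file whose name is built from DotFilePathPrefix
// and the given Label.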
3036 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3037 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3038     std::string Label) const {
3039   WriteGraph(this, "", false, Label,
3040              DotFilePathPrefix + "ccg." + Label + ".dot");
3041 }
3042 
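// Creates a new clone of the edge's callee node and moves the edge (or the
// given subset of its context ids) from the original callee to the new clone.
// The clone inherits the original node's MatchingCalls.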
3043 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3044 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3045 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3046     const std::shared_ptr<ContextEdge> &Edge,
3047     DenseSet<uint32_t> ContextIdsToMove) {
3048   ContextNode *Node = Edge->Callee;
3049   assert(NodeToCallingFunc.count(Node));
3050   ContextNode *Clone =
3051       createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3052   Node->addClone(Clone);
3053   Clone->MatchingCalls = Node->MatchingCalls;
3054   moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3055                                 ContextIdsToMove);
3056   return Clone;
3057 }
3058 
3059 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3060 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3061     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3062                                   ContextNode *NewCallee, bool NewClone,
3063                                   DenseSet<uint32_t> ContextIdsToMove) {
3064   // NewCallee and Edge's current callee must be clones of the same original
3065   // node (Edge's current callee may be the original node too).
3066   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3067 
3068   ContextNode *OldCallee = Edge->Callee;
3069 
3070   // We might already have an edge to the new callee from earlier cloning for a
3071   // different allocation. If one exists we will reuse it.
3072   auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3073 
3074   // Callers will pass an empty ContextIdsToMove set when they want to move the
3075   // edge. Copy in Edge's ids for simplicity.
3076   if (ContextIdsToMove.empty())
3077     ContextIdsToMove = Edge->getContextIds();
3078 
3079   // If we are moving all of Edge's ids, then just move the whole Edge.
3080   // Otherwise only move the specified subset, to a new edge if needed.
3081   if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3082     // First, update the alloc types on New Callee from Edge.
3083     // First, update the alloc types on NewCallee from Edge.
3084     NewCallee->AllocTypes |= Edge->AllocTypes;
3085     // Moving the whole Edge.
3086     if (ExistingEdgeToNewCallee) {
3087       // Since we already have an edge to NewCallee, simply move the ids
3088       // onto it, and remove the existing Edge.
3089       ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3090                                                       ContextIdsToMove.end());
3091       ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3092       assert(Edge->ContextIds == ContextIdsToMove);
3093       removeEdgeFromGraph(Edge.get());
3094     } else {
3095       // Otherwise just reconnect Edge to NewCallee.
3096       Edge->Callee = NewCallee;
3097       NewCallee->CallerEdges.push_back(Edge);
3098       // Remove it from callee where it was previously connected.
3099       OldCallee->eraseCallerEdge(Edge.get());
3100       // Don't need to update Edge's context ids since we are simply
3101       // reconnecting it.
3102     }
3103   } else {
3104     // Only moving a subset of Edge's ids.
3105     // Compute the alloc type of the subset of ids being moved.
3106     auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3107     if (ExistingEdgeToNewCallee) {
3108       // Since we already have an edge to NewCallee, simply move the ids
3109       // onto it.
3110       ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
3111                                                       ContextIdsToMove.end());
3112       ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3113     } else {
3114       // Otherwise, create a new edge to NewCallee for the ids being moved.
3115       auto NewEdge = std::make_shared<ContextEdge>(
3116           NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3117       Edge->Caller->CalleeEdges.push_back(NewEdge);
3118       NewCallee->CallerEdges.push_back(NewEdge);
3119     }
3120     // In either case, we need to update the alloc types on NewCallee, remove
3121     // those ids from the original Edge, and update its alloc type.
3122     NewCallee->AllocTypes |= CallerEdgeAllocType;
3123     set_subtract(Edge->ContextIds, ContextIdsToMove);
3124     Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3125   }
3126   // Now walk the old callee node's callee edges and move Edge's context ids
3127   // over to the corresponding edge into the clone (which is created here if
3128   // this is a newly created clone).
3129   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3130     // The context ids moving to the new callee are the subset of this edge's
3131     // context ids and the context ids on the caller edge being moved.
3132     DenseSet<uint32_t> EdgeContextIdsToMove =
3133         set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3134     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3135     OldCalleeEdge->AllocTypes =
3136         computeAllocType(OldCalleeEdge->getContextIds());
3137     if (!NewClone) {
3138       // Update context ids / alloc type on corresponding edge to NewCallee.
3139       // There is a chance this may not exist if we are reusing an existing
3140       // clone, specifically during function assignment, where we would have
3141       // removed none type edges after creating the clone. If we can't find
3142       // a corresponding edge there, fall through to the cloning below.
3143       if (auto *NewCalleeEdge =
3144               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
3145         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3146                                               EdgeContextIdsToMove.end());
3147         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3148         continue;
3149       }
3150     }
3151     auto NewEdge = std::make_shared<ContextEdge>(
3152         OldCalleeEdge->Callee, NewCallee,
3153         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3154     NewCallee->CalleeEdges.push_back(NewEdge);
3155     NewEdge->Callee->CallerEdges.push_back(NewEdge);
3156   }
3157   // Recompute the node alloc type now that its callee edges have been
3158   // updated (since we will compute from those edges).
3159   OldCallee->AllocTypes = OldCallee->computeAllocType();
3160   // OldCallee alloc type should be None iff its context id set is now empty.
3161   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3162          OldCallee->emptyContextIds());
3163   if (VerifyCCG) {
3164     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3165     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3166     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3167       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3168                                             /*CheckEdges=*/false);
3169     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3170       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3171                                             /*CheckEdges=*/false);
3172   }
3173 }
3174 
3175 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3176 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3177     moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3178                               ContextNode *NewCaller) {
3179 
3180   ContextNode *OldCaller = Edge->Caller;
3181   OldCaller->eraseCalleeEdge(Edge.get());
3182 
3183   // We might already have an edge to the new caller. If one exists we will
3184   // reuse it.
3185   auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(Edge->Callee);
3186 
3187   if (ExistingEdgeToNewCaller) {
3188     // Since we already have an edge to NewCaller, simply move the ids
3189     // onto it, and remove the existing Edge.
3190     ExistingEdgeToNewCaller->getContextIds().insert(
3191         Edge->getContextIds().begin(), Edge->getContextIds().end());
3192     ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3193     Edge->ContextIds.clear();
3194     Edge->AllocTypes = (uint8_t)AllocationType::None;
3195     Edge->Callee->eraseCallerEdge(Edge.get());
3196   } else {
3197     // Otherwise just reconnect Edge to NewCaller.
3198     Edge->Caller = NewCaller;
3199     NewCaller->CalleeEdges.push_back(Edge);
3200     // Don't need to update Edge's context ids since we are simply
3201     // reconnecting it.
3202   }
3203   // In either case, need to update the alloc types on New Caller.
3204   // In either case, we need to update the alloc types on NewCaller.
3205 
3206   // Now walk the old caller node's caller edges and move Edge's context ids
3207   // over to the corresponding edge into the node (which is created here if
3208   // this is a newly created node). We can tell whether this is a newly created
3209   // node by seeing if it has any caller edges yet.
3210 #ifndef NDEBUG
3211   bool IsNewNode = NewCaller->CallerEdges.empty();
3212 #endif
3213   for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3214     // The context ids moving to the new caller are the subset of this edge's
3215     // context ids and the context ids on the callee edge being moved.
3216     DenseSet<uint32_t> EdgeContextIdsToMove =
3217         set_intersection(OldCallerEdge->getContextIds(), Edge->getContextIds());
3218     set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3219     OldCallerEdge->AllocTypes =
3220         computeAllocType(OldCallerEdge->getContextIds());
3221     // In this function we expect that any pre-existing node already has edges
3222     // from the same callers as the old node. That should be true in the current
3223     // use case, where we will remove None-type edges after copying over all
3224     // caller edges from the callee.
3225     auto *ExistingCallerEdge =
3226         NewCaller->findEdgeFromCaller(OldCallerEdge->Caller);
3227     assert(IsNewNode || ExistingCallerEdge);
3228     if (ExistingCallerEdge) {
3229       ExistingCallerEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
3230                                                  EdgeContextIdsToMove.end());
3231       ExistingCallerEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3232       continue;
3233     }
3234     auto NewEdge = std::make_shared<ContextEdge>(
3235         NewCaller, OldCallerEdge->Caller,
3236         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
3237     NewCaller->CallerEdges.push_back(NewEdge);
3238     NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3239   }
3240   // Recompute the node alloc type now that its caller edges have been
3241   // updated (since we will compute from those edges).
3242   OldCaller->AllocTypes = OldCaller->computeAllocType();
3243   // OldCaller alloc type should be None iff its context id set is now empty.
3244   assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3245          OldCaller->emptyContextIds());
3246   if (VerifyCCG) {
3247     checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3248     checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3249     for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3250       checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3251                                             /*CheckEdges=*/false);
3252     for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3253       checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3254                                             /*CheckEdges=*/false);
3255   }
3256 }
3257 
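// Removes None-type callee edges from Node, then recurses through its clones
// and its callers so the cleanup is applied across the reachable subgraph.
// The Visited set guards against processing a node more than once.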
3258 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3259 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3260     recursivelyRemoveNoneTypeCalleeEdges(
3261         ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3262   auto Inserted = Visited.insert(Node);
3263   if (!Inserted.second)
3264     return;
3265 
3266   removeNoneTypeCalleeEdges(Node);
3267 
3268   for (auto *Clone : Node->Clones)
3269     recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3270 
3271   // The recursive call may remove some of this Node's caller edges.
3272   // Iterate over a copy and skip any that were removed.
3273   auto CallerEdges = Node->CallerEdges;
3274   for (auto &Edge : CallerEdges) {
3275     // Skip any that have been removed by an earlier recursive call.
3276     if (Edge->isRemoved()) {
3277       assert(!is_contained(Node->CallerEdges, Edge));
3278       continue;
3279     }
3280     recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3281   }
3282 }
3283 
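// Driver for cloning: invoke the recursive identifyClones on each allocation
// node, then remove the None-type callee edges left behind by the edge moves.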
3284 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3285 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3286   DenseSet<const ContextNode *> Visited;
3287   for (auto &Entry : AllocationCallToContextNodeMap) {
3288     Visited.clear();
3289     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3290   }
3291   Visited.clear();
3292   for (auto &Entry : AllocationCallToContextNodeMap)
3293     recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3294   if (VerifyCCG)
3295     check();
3296 }
3297 
3298 // helper function to check an AllocType is cold or notcold or both.
3299 // Helper function to check whether an AllocType is cold, notcold, or both.
3300   return (AllocType == (uint8_t)AllocationType::Cold) ||
3301          (AllocType == (uint8_t)AllocationType::NotCold) ||
3302          (AllocType ==
3303           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3304 }
3305 
3306 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3307 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3308     ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3309     const DenseSet<uint32_t> &AllocContextIds) {
3310   if (VerifyNodes)
3311     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3312   assert(!Node->CloneOf);
3313 
3314   // If Node has a null call, then either it wasn't found in the module (regular
3315   // LTO) or summary index (ThinLTO), or there were other conditions blocking
3316   // cloning (e.g. recursion, calls multiple targets, etc.).
3317   // Do this here so that we don't try to recursively clone callers below, which
3318   // isn't useful at least for this node.
3319   if (!Node->hasCall())
3320     return;
3321 
3322 #ifndef NDEBUG
3323   auto Insert =
3324 #endif
3325       Visited.insert(Node);
3326   // We should not have visited this node yet.
3327   assert(Insert.second);
3328   // The recursive call to identifyClones may delete the current edge from the
3329   // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3330   // in an iterator and having recursive call erase from it. Other edges may
3331   // also get removed during the recursion, which will have null Callee and
3332   // Caller pointers (and are deleted later), so we skip those below.
3333   {
3334     auto CallerEdges = Node->CallerEdges;
3335     for (auto &Edge : CallerEdges) {
3336       // Skip any that have been removed by an earlier recursive call.
3337       if (Edge->isRemoved()) {
3338         assert(!is_contained(Node->CallerEdges, Edge));
3339         continue;
3340       }
3341       // Ignore any caller we previously visited via another edge.
3342       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3343         identifyClones(Edge->Caller, Visited, AllocContextIds);
3344       }
3345     }
3346   }
3347 
3348   // Check if we reached an unambiguous call or have only a single caller.
3349   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3350     return;
3351 
3352   // We need to clone.
3353 
3354   // Try to keep the original version as alloc type NotCold. This will make
3355   // cases with indirect calls or any other situation with an unknown call to
3356   // the original function get the default behavior. We do this by sorting the
3357   // CallerEdges of the Node we will clone by alloc type.
3358   //
3359   // Give NotCold edge the lowest sort priority so those edges are at the end of
3360   // the caller edges vector, and stay on the original version (since the below
3361   // code clones greedily until it finds all remaining edges have the same type
3362   // and leaves the remaining ones on the original Node).
3363   //
3364   // We shouldn't actually have any None type edges, so the sorting priority for
3365   // that is arbitrary, and we assert in that case below.
3366   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3367                                                /*Cold*/ 1,
3368                                                /*NotColdCold*/ 2};
3369   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
3370                    [&](const std::shared_ptr<ContextEdge> &A,
3371                        const std::shared_ptr<ContextEdge> &B) {
3372                      // Nodes with non-empty context ids should be sorted before
3373                      // those with empty context ids.
3374                      if (A->ContextIds.empty())
3375                        // Either B ContextIds are non-empty (in which case we
3376                        // should return false because B < A), or B ContextIds
3377                        // are empty, in which case they are equal, and we should
3378                        // maintain the original relative ordering.
3379                        return false;
3380                      if (B->ContextIds.empty())
3381                        return true;
3382 
3383                      if (A->AllocTypes == B->AllocTypes)
3384                        // Use the first context id for each edge as a
3385                        // tie-breaker.
3386                        return *A->ContextIds.begin() < *B->ContextIds.begin();
3387                      return AllocTypeCloningPriority[A->AllocTypes] <
3388                             AllocTypeCloningPriority[B->AllocTypes];
3389                    });
3390 
3391   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3392 
3393   DenseSet<uint32_t> RecursiveContextIds;
3394   // If we are allowing recursive callsites, but have also disabled recursive
3395   // contexts, look for context ids that show up in multiple caller edges.
3396   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3397     DenseSet<uint32_t> AllCallerContextIds;
3398     for (auto &CE : Node->CallerEdges) {
3399       // Reserve space for the largest set of caller context ids, since we know
3400       // the final set will be at least that large.
3401       AllCallerContextIds.reserve(CE->getContextIds().size());
3402       for (auto Id : CE->getContextIds())
3403         if (!AllCallerContextIds.insert(Id).second)
3404           RecursiveContextIds.insert(Id);
3405     }
3406   }
3407 
3408   // Iterate until we find no more opportunities for disambiguating the alloc
3409   // types via cloning. In most cases this loop will terminate once the Node
3410   // has a single allocation type, in which case no more cloning is needed.
3411   // Iterate over a copy of Node's caller edges, since we may need to remove
3412   // edges in the moveEdgeTo* methods, and this simplifies the handling and
3413   // makes it less error-prone.
3414   auto CallerEdges = Node->CallerEdges;
3415   for (auto &CallerEdge : CallerEdges) {
3416     // See if cloning the prior caller edge left this node with a single alloc
3417     // type or a single caller. In that case no more cloning of Node is needed.
3418     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3419       break;
3420 
3421     // If the caller was not successfully matched to a call in the IR/summary,
3422     // there is no point in trying to clone for it as we can't update that call.
3423     if (!CallerEdge->Caller->hasCall())
3424       continue;
3425 
3426     // Only need to process the ids along this edge pertaining to the given
3427     // allocation.
3428     auto CallerEdgeContextsForAlloc =
3429         set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3430     if (!RecursiveContextIds.empty())
3431       CallerEdgeContextsForAlloc =
3432           set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3433     if (CallerEdgeContextsForAlloc.empty())
3434       continue;
3435 
3436     auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3437 
3438     // Compute the node callee edge alloc types corresponding to the context ids
3439     // for this caller edge.
3440     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3441     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3442     for (auto &CalleeEdge : Node->CalleeEdges)
3443       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3444           CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3445 
3446     // Don't clone if doing so will not disambiguate any alloc types amongst
3447     // caller edges (including the callee edges that would be cloned).
3448     // Otherwise we will simply move all edges to the clone.
3449     //
3450     // First check if by cloning we will disambiguate the caller allocation
3451     // type from node's allocation type. Query allocTypeToUse so that we don't
3452     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3453     // neither of these should be None type.
3454     //
3455     // Then check if by cloning node at least one of the callee edges will be
3456     // disambiguated by splitting out different context ids.
3457     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3458     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3459     if (allocTypeToUse(CallerAllocTypeForAlloc) ==
3460             allocTypeToUse(Node->AllocTypes) &&
3461         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3462             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges))
3463       continue;
3464 
3465     // First see if we can use an existing clone. Check each clone and its
3466     // callee edges for matching alloc types.
3467     ContextNode *Clone = nullptr;
3468     for (auto *CurClone : Node->Clones) {
3469       if (allocTypeToUse(CurClone->AllocTypes) !=
3470           allocTypeToUse(CallerAllocTypeForAlloc))
3471         continue;
3472 
3473       bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3474                              hasSingleAllocType(CallerAllocTypeForAlloc);
3475       // The above check should mean that if both have single alloc types that
3476       // they should be equal.
3477       assert(!BothSingleAlloc ||
3478              CurClone->AllocTypes == CallerAllocTypeForAlloc);
3479 
3480       // If either both have a single alloc type (which are the same), or if the
3481       // clone's callee edges have the same alloc types as those for the current
3482       // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3483       // then we can reuse this clone.
3484       if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3485                                  CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3486         Clone = CurClone;
3487         break;
3488       }
3489     }
3490 
3491     // Move the caller edge (restricted to this allocation's ids) to the clone.
3492     if (Clone)
3493       moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3494                                     CallerEdgeContextsForAlloc);
3495     else
3496       Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3497 
3498     // Sanity check that no alloc types on clone or its edges are None.
3499     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3500   }
3501 
3502   // We should still have some context ids on the original Node.
3503   assert(!Node->emptyContextIds());
3504 
3505   // Sanity check that no alloc types on node or edges are None.
3506   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3507 
3508   if (VerifyNodes)
3509     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3510 }
3511 
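// Annotates the IR allocation call with a "memprof" function attribute
// recording the chosen allocation type, and emits an optimization remark.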
3512 void ModuleCallsiteContextGraph::updateAllocationCall(
3513     CallInfo &Call, AllocationType AllocType) {
3514   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3515   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3516                                 "memprof", AllocTypeString);
3517   cast<CallBase>(Call.call())->addFnAttr(A);
3518   OREGetter(Call.call()->getFunction())
3519       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3520             << ore::NV("AllocationCall", Call.call()) << " in clone "
3521             << ore::NV("Caller", Call.call()->getFunction())
3522             << " marked with memprof allocation attribute "
3523             << ore::NV("Attribute", AllocTypeString));
3524 }
3525 
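// Records the assigned allocation type for this clone in the summary
// AllocInfo's Versions array.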
3526 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3527                                                      AllocationType AllocType) {
3528   auto *AI = cast<AllocInfo *>(Call.call());
3529   assert(AI);
3530   assert(AI->Versions.size() > Call.cloneNo());
3531   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3532 }
3533 
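// Returns the allocation type encoded in the IR call's "memprof" attribute
// (None if the attribute is absent).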
3534 AllocationType
3535 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3536   const auto *CB = cast<CallBase>(Call.call());
3537   if (!CB->getAttributes().hasFnAttr("memprof"))
3538     return AllocationType::None;
3539   return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3540              ? AllocationType::Cold
3541              : AllocationType::NotCold;
3542 }
3543 
3544 AllocationType
3545 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3546   const auto *AI = cast<AllocInfo *>(Call.call());
3547   assert(AI->Versions.size() > Call.cloneNo());
3548   return (AllocationType)AI->Versions[Call.cloneNo()];
3549 }
3550 
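// Redirects the caller's call to the assigned function clone (when the clone
// number is non-zero) and emits an optimization remark.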
3551 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3552                                             FuncInfo CalleeFunc) {
3553   if (CalleeFunc.cloneNo() > 0)
3554     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3555   OREGetter(CallerCall.call()->getFunction())
3556       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3557             << ore::NV("Call", CallerCall.call()) << " in clone "
3558             << ore::NV("Caller", CallerCall.call()->getFunction())
3559             << " assigned to call function clone "
3560             << ore::NV("Callee", CalleeFunc.func()));
3561 }
3562 
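// Records in the summary which clone of the callee function this callsite
// clone should call, by storing the callee's clone number in the Clones array.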
3563 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3564                                            FuncInfo CalleeFunc) {
3565   auto *CI = cast<CallsiteInfo *>(CallerCall.call());
3566   assert(CI &&
3567          "Caller cannot be an allocation which should not have profiled calls");
3568   assert(CI->Clones.size() > CallerCall.cloneNo());
3569   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3570 }
3571 
3572 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
3573                      Instruction *>::FuncInfo
3574 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
3575     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3576     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3577   // Use existing LLVM facilities for cloning and obtaining the call in the clone.
3578   ValueToValueMapTy VMap;
3579   auto *NewFunc = CloneFunction(Func.func(), VMap);
3580   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
3581   assert(!Func.func()->getParent()->getFunction(Name));
3582   NewFunc->setName(Name);
3583   for (auto &Inst : CallsWithMetadataInFunc) {
3584     // This map always has the initial version in it.
3585     assert(Inst.cloneNo() == 0);
3586     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
3587   }
3588   OREGetter(Func.func())
3589       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
3590             << "created clone " << ore::NV("NewFunction", NewFunc));
3591   return {NewFunc, CloneNo};
3592 }
3593 
3594 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
3595                      IndexCall>::FuncInfo
3596 IndexCallsiteContextGraph::cloneFunctionForCallsite(
3597     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
3598     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
3599   // Check how many clones we have of Call (and therefore of the function).
3600   // The next clone number is the current size of versions array.
3601   // Confirm this matches the CloneNo provided by the caller, which is based on
3602   // the number of function clones we have.
3603   assert(CloneNo == (isa<AllocInfo *>(Call.call())
3604                          ? cast<AllocInfo *>(Call.call())->Versions.size()
3605                          : cast<CallsiteInfo *>(Call.call())->Clones.size()));
3606   // Walk all the instructions in this function. Create a new version for
3607   // each (by adding an entry to the Versions/Clones summary array), and copy
3608   // over the version being called for the function clone being cloned here.
3609   // Additionally, add an entry to the CallMap for the new function clone,
3610   // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
3611   // to the new call clone.
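  // For example (illustrative): when creating function clone 2, an AllocInfo
  // whose Versions array is currently {NotCold, Cold} grows to
  // {NotCold, Cold, None}; the new slot is filled in later by
  // updateAllocationCall once this clone's allocation type is known.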
3612   for (auto &Inst : CallsWithMetadataInFunc) {
3613     // This map always has the initial version in it.
3614     assert(Inst.cloneNo() == 0);
3615     if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
3616       assert(AI->Versions.size() == CloneNo);
3617       // We assign the allocation type later (in updateAllocationCall); just
3618       // add an entry for it here.
3619       AI->Versions.push_back(0);
3620     } else {
3621       auto *CI = cast<CallsiteInfo *>(Inst.call());
3622       assert(CI && CI->Clones.size() == CloneNo);
3623       // We assign the clone number later (in updateCall); just add an entry
3624       // for it here.
3625       CI->Clones.push_back(0);
3626     }
3627     CallMap[Inst] = {Inst.call(), CloneNo};
3628   }
3629   return {Func.func(), CloneNo};
3630 }
3631 
3632 // This method assigns cloned callsites to functions, cloning the functions as
3633 // needed. The assignment is greedy and proceeds roughly as follows:
3634 //
3635 // For each function Func:
3636 //   For each call with graph Node having clones:
3637 //     Initialize ClonesWorklist to Node and its clones
3638 //     Initialize NodeCloneCount to 0
3639 //     While ClonesWorklist is not empty:
3640 //        Clone = pop front ClonesWorklist
3641 //        NodeCloneCount++
3642 //        If Func has been cloned less than NodeCloneCount times:
3643 //           If NodeCloneCount is 1:
3644 //             Assign Clone to original Func
3645 //             Continue
3646 //           Create a new function clone
3647 //           If other callers not assigned to call a function clone yet:
3648 //              Assign them to call new function clone
3649 //              Continue
3650 //           Assign any other caller calling the cloned version to new clone
3651 //
3652 //        For each caller of Clone:
3653 //           If caller is assigned to call a specific function clone:
3654 //             If we cannot assign Clone to that function clone:
3655 //               Create new callsite Clone NewClone
3656 //               Add NewClone to ClonesWorklist
3657 //               Continue
3658 //             Assign Clone to existing caller's called function clone
3659 //           Else:
3660 //             If Clone not already assigned to a function clone:
3661 //                Assign to first function clone without assignment
3662 //             Assign caller to selected function clone
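//
// Illustrative example: suppose function Foo contains callsite node N, which
// was split during cloning into N (reached by callers X and Y) and clone N'
// (reached by caller Z). Processing N first assigns it to the original Foo and
// records X and Y as calling Foo. Processing N' then finds only one function
// copy exists, so Foo is cloned (clone 1), N' is assigned to that clone, and Z
// is recorded as calling it. Additional function clones are only created when
// the existing ones already hold a different clone of this callsite node.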
3663 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3664 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
3665   bool Changed = false;
3666 
3667   // Keep track of the assignment of nodes (callsites) to function clones they
3668   // call.
3669   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
3670 
3671   // Update caller node to call function version CalleeFunc, by recording the
3672   // assignment in CallsiteToCalleeFuncCloneMap.
3673   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
3674                                         const FuncInfo &CalleeFunc) {
3675     assert(Caller->hasCall());
3676     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
3677   };
3678 
3679   // Walk all functions for which we saw calls with memprof metadata, and
3680   // handle cloning for each of their calls.
3681   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
3682     FuncInfo OrigFunc(Func);
3683     // Map from each clone of OrigFunc to a map of remappings of each call of
3684     // interest (from original uncloned call to the corresponding cloned call in
3685     // that function clone).
3686     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
3687     for (auto &Call : CallsWithMetadata) {
3688       ContextNode *Node = getNodeForInst(Call);
3689       // Skip call if we do not have a node for it (all uses of its stack ids
3690       // were either on inlined chains or pruned from the MIBs), or if we did
3691       // not create any clones for it.
3692       if (!Node || Node->Clones.empty())
3693         continue;
3694       assert(Node->hasCall() &&
3695              "Not having a call should have prevented cloning");
3696 
3697       // Track the assignment of function clones to clones of the current
3698       // callsite Node being handled.
3699       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3700 
3701       // Assign callsite version CallsiteClone to function version FuncClone,
3702       // and also assign (possibly cloned) Call to CallsiteClone.
3703       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3704                                                 CallInfo &Call,
3705                                                 ContextNode *CallsiteClone,
3706                                                 bool IsAlloc) {
3707         // Record the clone of callsite node assigned to this function clone.
3708         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3709 
3710         assert(FuncClonesToCallMap.count(FuncClone));
3711         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3712         CallInfo CallClone(Call);
3713         if (CallMap.count(Call))
3714           CallClone = CallMap[Call];
3715         CallsiteClone->setCall(CallClone);
3716         // Need to do the same for all matching calls.
3717         for (auto &MatchingCall : Node->MatchingCalls) {
3718           CallInfo CallClone(MatchingCall);
3719           if (CallMap.count(MatchingCall))
3720             CallClone = CallMap[MatchingCall];
3721           // Updates the call in the list.
3722           MatchingCall = CallClone;
3723         }
3724       };
3725 
3726       // Keep track of the clones of callsite Node that need to be assigned to
3727       // function clones. This list may be expanded in the loop body below if we
3728       // find additional cloning is required.
3729       std::deque<ContextNode *> ClonesWorklist;
3730       // Ignore original Node if we moved all of its contexts to clones.
3731       if (!Node->emptyContextIds())
3732         ClonesWorklist.push_back(Node);
3733       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3734                             Node->Clones.end());
3735 
3736       // Now walk through all of the clones of this callsite Node that we need,
3737       // and determine the assignment to a corresponding clone of the current
3738       // function (creating new function clones as needed).
3739       unsigned NodeCloneCount = 0;
3740       while (!ClonesWorklist.empty()) {
3741         ContextNode *Clone = ClonesWorklist.front();
3742         ClonesWorklist.pop_front();
3743         NodeCloneCount++;
3744         if (VerifyNodes)
3745           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3746 
3747         // Need to create a new function clone if we have more callsite clones
3748         // than existing function clones, which would have been assigned to an
3749         // earlier clone in the list (we assign callsite clones to function
3750         // clones greedily).
3751         if (FuncClonesToCallMap.size() < NodeCloneCount) {
3752           // If this is the first callsite copy, assign to original function.
3753           if (NodeCloneCount == 1) {
3754             // Since FuncClonesToCallMap is empty in this case, no clones have
3755             // been created for this function yet, and no callers should have
3756             // been assigned a function clone for this callee node yet.
3757             assert(llvm::none_of(
3758                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3759                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3760                 }));
3761             // Initialize with empty call map, assign Clone to original function
3762             // and its callers, and skip to the next clone.
3763             FuncClonesToCallMap[OrigFunc] = {};
3764             AssignCallsiteCloneToFuncClone(
3765                 OrigFunc, Call, Clone,
3766                 AllocationCallToContextNodeMap.count(Call));
3767             for (auto &CE : Clone->CallerEdges) {
3768               // Ignore any caller that does not have a recorded callsite Call.
3769               if (!CE->Caller->hasCall())
3770                 continue;
3771               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3772             }
3773             continue;
3774           }
3775 
3776           // First locate which copy of OrigFunc to clone again. If a caller
3777           // of this callsite clone was already assigned to call a particular
3778           // function clone, we need to redirect all of those callers to the
3779           // new function clone, and update their other callees within this
3780           // function.
3781           FuncInfo PreviousAssignedFuncClone;
3782           auto EI = llvm::find_if(
3783               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3784                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3785               });
3786           bool CallerAssignedToCloneOfFunc = false;
3787           if (EI != Clone->CallerEdges.end()) {
3788             const std::shared_ptr<ContextEdge> &Edge = *EI;
3789             PreviousAssignedFuncClone =
3790                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3791             CallerAssignedToCloneOfFunc = true;
3792           }
3793 
3794           // Clone function and save it along with the CallInfo map created
3795           // during cloning in the FuncClonesToCallMap.
3796           std::map<CallInfo, CallInfo> NewCallMap;
3797           unsigned CloneNo = FuncClonesToCallMap.size();
3798           assert(CloneNo > 0 && "Clone 0 is the original function, which "
3799                                 "should already exist in the map");
3800           FuncInfo NewFuncClone = cloneFunctionForCallsite(
3801               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3802           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3803           FunctionClonesAnalysis++;
3804           Changed = true;
3805 
3806           // If no caller callsites were already assigned to a clone of this
3807           // function, we can simply assign this clone to the new func clone
3808           // and update all callers to it, then skip to the next clone.
3809           if (!CallerAssignedToCloneOfFunc) {
3810             AssignCallsiteCloneToFuncClone(
3811                 NewFuncClone, Call, Clone,
3812                 AllocationCallToContextNodeMap.count(Call));
3813             for (auto &CE : Clone->CallerEdges) {
3814               // Ignore any caller that does not have a recorded callsite Call.
3815               if (!CE->Caller->hasCall())
3816                 continue;
3817               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3818             }
3819             continue;
3820           }
3821 
3822           // We may need to do additional node cloning in this case.
3823           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3824           // that were previously assigned to call PreviousAssignedFuncClone,
3825           // to record that they now call NewFuncClone.
3826           // The none type edge removal may remove some of this Clone's caller
3827           // edges, if it is reached via another of its caller's callees.
3828           // Iterate over a copy and skip any that were removed.
3829           auto CallerEdges = Clone->CallerEdges;
3830           for (auto CE : CallerEdges) {
3831             // Skip any that have been removed on an earlier iteration.
3832             if (CE->isRemoved()) {
3833               assert(!is_contained(Clone->CallerEdges, CE));
3834               continue;
3835             }
3836             assert(CE);
3837             // Ignore any caller that does not have a recorded callsite Call.
3838             if (!CE->Caller->hasCall())
3839               continue;
3840 
3841             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3842                 // We subsequently fall through to later handling that
3843                 // will perform any additional cloning required for
3844                 // callers that were calling other function clones.
3845                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3846                     PreviousAssignedFuncClone)
3847               continue;
3848 
3849             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3850 
3851             // If we are cloning a function that was already assigned to some
3852             // callers, then essentially we are creating new callsite clones
3853             // of the other callsites in that function that are reached by those
3854             // callers. Clone the other callees of the current callsite's caller
3855             // that were already assigned to PreviousAssignedFuncClone
3856             // accordingly. This is important since we subsequently update the
3857             // calls from the nodes in the graph and their assignments to callee
3858             // functions recorded in CallsiteToCalleeFuncCloneMap.
3859             // The none type edge removal may remove some of this caller's
3860             // callee edges, if it is reached via another of its callees.
3861             // Iterate over a copy and skip any that were removed.
3862             auto CalleeEdges = CE->Caller->CalleeEdges;
3863             for (auto CalleeEdge : CalleeEdges) {
3864               // Skip any that have been removed on an earlier iteration when
3865               // cleaning up newly None type callee edges.
3866               if (CalleeEdge->isRemoved()) {
3867                 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
3868                 continue;
3869               }
3870               assert(CalleeEdge);
3871               ContextNode *Callee = CalleeEdge->Callee;
3872               // Skip the current callsite; we are looking for other
3873               // callsites Caller calls, as well as any that do not have a
3874               // recorded callsite Call.
3875               if (Callee == Clone || !Callee->hasCall())
3876                 continue;
3877               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3878               removeNoneTypeCalleeEdges(NewClone);
3879               // Moving the edge may have resulted in some none type
3880               // callee edges on the original Callee.
3881               removeNoneTypeCalleeEdges(Callee);
3882               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3883               // If the Callee node was already assigned to call a specific
3884               // function version, make sure its new clone is assigned to call
3885               // that same function clone.
3886               if (CallsiteToCalleeFuncCloneMap.count(Callee))
3887                 RecordCalleeFuncOfCallsite(
3888                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3889               // Update NewClone with the new Call clone of this callsite's Call
3890               // created for the new function clone created earlier.
3891               // Recall that we have already ensured when building the graph
3892               // that each caller can only call callsites within the same
3893               // function, so we are guaranteed that Callee Call is in the
3894               // current OrigFunc.
3895               // CallMap is set up as indexed by original Call at clone 0.
3896               CallInfo OrigCall(Callee->getOrigNode()->Call);
3897               OrigCall.setCloneNo(0);
3898               std::map<CallInfo, CallInfo> &CallMap =
3899                   FuncClonesToCallMap[NewFuncClone];
3900               assert(CallMap.count(OrigCall));
3901               CallInfo NewCall(CallMap[OrigCall]);
3902               assert(NewCall);
3903               NewClone->setCall(NewCall);
3904               // Need to do the same for all matching calls.
3905               for (auto &MatchingCall : NewClone->MatchingCalls) {
3906                 CallInfo OrigMatchingCall(MatchingCall);
3907                 OrigMatchingCall.setCloneNo(0);
3908                 assert(CallMap.count(OrigMatchingCall));
3909                 CallInfo NewCall(CallMap[OrigMatchingCall]);
3910                 assert(NewCall);
3911                 // Updates the call in the list.
3912                 MatchingCall = NewCall;
3913               }
3914             }
3915           }
3916           // Fall through to handling below to perform the recording of the
3917           // function for this callsite clone. This enables handling of cases
3918           // where the callers were assigned to different clones of a function.
3919         }
3920 
3921         // See if we can use existing function clone. Walk through
3922         // all caller edges to see if any have already been assigned to
3923         // a clone of this callsite's function. If we can use it, do so. If not,
3924         // because that function clone is already assigned to a different clone
3925         // of this callsite, then we need to clone again.
3926         // Basically, this checking is needed to handle the case where different
3927         // caller functions/callsites may need versions of this function
3928         // containing different mixes of callsite clones across the different
3929         // callsites within the function. If that happens, we need to create
3930         // additional function clones to handle the various combinations.
3931         //
3932         // Keep track of any new clones of this callsite created by the
3933         // following loop, as well as any existing clone that we decided to
3934         // assign this clone to.
3935         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3936         FuncInfo FuncCloneAssignedToCurCallsiteClone;
3937         // Iterate over a copy of Clone's caller edges, since we may need to
3938         // remove edges in the moveEdgeTo* methods, and this simplifies the
3939         // handling and makes it less error-prone.
3940         auto CloneCallerEdges = Clone->CallerEdges;
3941         for (auto &Edge : CloneCallerEdges) {
3942           // Ignore any caller that does not have a recorded callsite Call.
3943           if (!Edge->Caller->hasCall())
3944             continue;
3945           // If this caller already assigned to call a version of OrigFunc, need
3946           // to ensure we can assign this callsite clone to that function clone.
3947           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3948             FuncInfo FuncCloneCalledByCaller =
3949                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3950             // First we need to confirm that this function clone is available
3951             // for use by this callsite node clone.
3952             //
3953             // While FuncCloneToCurNodeCloneMap is built only for this Node and
3954             // its callsite clones, one of those callsite clones X could have
3955             // been assigned to the same function clone called by Edge's caller
3956             // - if Edge's caller calls another callsite within Node's original
3957             // function, and that callsite has another caller reaching clone X.
3958             // We need to clone Node again in this case.
3959             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3960                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3961                      Clone) ||
3962                 // Detect when we have multiple callers of this callsite that
3963                 // have already been assigned to specific, and different, clones
3964                 // of OrigFunc (due to other unrelated callsites in Func they
3965                 // reach via call contexts). Is this Clone of callsite Node
3966                 // assigned to a different clone of OrigFunc? If so, clone Node
3967                 // again.
3968                 (FuncCloneAssignedToCurCallsiteClone &&
3969                  FuncCloneAssignedToCurCallsiteClone !=
3970                      FuncCloneCalledByCaller)) {
3971               // We need to use a different newly created callsite clone, in
3972               // order to assign it to another new function clone on a
3973               // subsequent iteration over the Clones array (adjusted below).
3974               // Note we specifically do not reset the
3975               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3976               // when this new clone is processed later we know which version of
3977               // the function to copy (so that other callsite clones we have
3978               // assigned to that function clone are properly cloned over). See
3979               // comments in the function cloning handling earlier.
3980 
3981               // Check if we already have cloned this callsite again while
3982               // walking through caller edges, for a caller calling the same
3983               // function clone. If so, we can move this edge to that new clone
3984               // rather than creating yet another new clone.
3985               if (FuncCloneToNewCallsiteCloneMap.count(
3986                       FuncCloneCalledByCaller)) {
3987                 ContextNode *NewClone =
3988                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3989                 moveEdgeToExistingCalleeClone(Edge, NewClone);
3990                 // Cleanup any none type edges cloned over.
3991                 removeNoneTypeCalleeEdges(NewClone);
3992               } else {
3993                 // Create a new callsite clone.
3994                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
3995                 removeNoneTypeCalleeEdges(NewClone);
3996                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3997                     NewClone;
3998                 // Add to list of clones and process later.
3999                 ClonesWorklist.push_back(NewClone);
4000                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4001               }
4002               // Moving the caller edge may have resulted in some none type
4003               // callee edges.
4004               removeNoneTypeCalleeEdges(Clone);
4005               // We will handle the newly created callsite clone in a subsequent
4006               // iteration over this Node's Clones.
4007               continue;
4008             }
4009 
4010             // Otherwise, we can use the function clone already assigned to this
4011             // caller.
4012             if (!FuncCloneAssignedToCurCallsiteClone) {
4013               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4014               // Assign Clone to FuncCloneCalledByCaller
4015               AssignCallsiteCloneToFuncClone(
4016                   FuncCloneCalledByCaller, Call, Clone,
4017                   AllocationCallToContextNodeMap.count(Call));
4018             } else
4019               // Don't need to do anything - callsite is already calling this
4020               // function clone.
4021               assert(FuncCloneAssignedToCurCallsiteClone ==
4022                      FuncCloneCalledByCaller);
4023 
4024           } else {
4025             // We have not already assigned this caller to a version of
4026             // OrigFunc. Do the assignment now.
4027 
4028             // First check if we have already assigned this callsite clone to a
4029             // clone of OrigFunc for another caller during this iteration over
4030             // its caller edges.
4031             if (!FuncCloneAssignedToCurCallsiteClone) {
4032               // Find first function in FuncClonesToCallMap without an assigned
4033               // clone of this callsite Node. We should always have one
4034               // available at this point due to the earlier cloning when the
4035               // FuncClonesToCallMap size was smaller than the clone number.
4036               for (auto &CF : FuncClonesToCallMap) {
4037                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4038                   FuncCloneAssignedToCurCallsiteClone = CF.first;
4039                   break;
4040                 }
4041               }
4042               assert(FuncCloneAssignedToCurCallsiteClone);
4043               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4044               AssignCallsiteCloneToFuncClone(
4045                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4046                   AllocationCallToContextNodeMap.count(Call));
4047             } else
4048               assert(FuncCloneToCurNodeCloneMap
4049                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
4050             // Update callers to record function version called.
4051             RecordCalleeFuncOfCallsite(Edge->Caller,
4052                                        FuncCloneAssignedToCurCallsiteClone);
4053           }
4054         }
4055       }
4056       if (VerifyCCG) {
4057         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4058         for (const auto &PE : Node->CalleeEdges)
4059           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4060         for (const auto &CE : Node->CallerEdges)
4061           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4062         for (auto *Clone : Node->Clones) {
4063           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4064           for (const auto &PE : Clone->CalleeEdges)
4065             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4066           for (const auto &CE : Clone->CallerEdges)
4067             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4068         }
4069       }
4070     }
4071   }
4072 
4073   uint8_t BothTypes =
4074       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
4075 
4076   auto UpdateCalls = [&](ContextNode *Node,
4077                          DenseSet<const ContextNode *> &Visited,
4078                          auto &&UpdateCalls) {
4079     auto Inserted = Visited.insert(Node);
4080     if (!Inserted.second)
4081       return;
4082 
4083     for (auto *Clone : Node->Clones)
4084       UpdateCalls(Clone, Visited, UpdateCalls);
4085 
4086     for (auto &Edge : Node->CallerEdges)
4087       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4088 
4089     // Skip if either no call to update, or if we ended up with no context ids
4090     // (we moved all edges onto other clones).
4091     if (!Node->hasCall() || Node->emptyContextIds())
4092       return;
4093 
4094     if (Node->IsAllocation) {
4095       auto AT = allocTypeToUse(Node->AllocTypes);
4096       // If the allocation type is ambiguous, and more aggressive hinting
4097       // has been enabled via the MinClonedColdBytePercent flag, see if this
4098       // allocation should be hinted cold anyway because its fraction of cold
4099       // bytes allocated is at least the given threshold.
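      // For example (illustrative), with MinClonedColdBytePercent == 90 and
      // profiled sizes of 950 cold bytes out of 1000 total bytes for this
      // node's contexts, 950 * 100 >= 1000 * 90 holds and the allocation is
      // hinted cold despite the ambiguous alloc type.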
4100       if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4101           !ContextIdToContextSizeInfos.empty()) {
4102         uint64_t TotalCold = 0;
4103         uint64_t Total = 0;
4104         for (auto Id : Node->getContextIds()) {
4105           auto TypeI = ContextIdToAllocationType.find(Id);
4106           assert(TypeI != ContextIdToAllocationType.end());
4107           auto CSI = ContextIdToContextSizeInfos.find(Id);
4108           if (CSI != ContextIdToContextSizeInfos.end()) {
4109             for (auto &Info : CSI->second) {
4110               Total += Info.TotalSize;
4111               if (TypeI->second == AllocationType::Cold)
4112                 TotalCold += Info.TotalSize;
4113             }
4114           }
4115         }
4116         if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4117           AT = AllocationType::Cold;
4118       }
4119       updateAllocationCall(Node->Call, AT);
4120       assert(Node->MatchingCalls.empty());
4121       return;
4122     }
4123 
4124     if (!CallsiteToCalleeFuncCloneMap.count(Node))
4125       return;
4126 
4127     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4128     updateCall(Node->Call, CalleeFunc);
4129     // Update all the matching calls as well.
4130     for (auto &Call : Node->MatchingCalls)
4131       updateCall(Call, CalleeFunc);
4132   };
4133 
4134   // Performs DFS traversal starting from allocation nodes to update calls to
4135   // reflect cloning decisions recorded earlier. For regular LTO this will
4136   // update the actual calls in the IR to call the appropriate function clone
4137   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4138   // are recorded in the summary entries.
4139   DenseSet<const ContextNode *> Visited;
4140   for (auto &Entry : AllocationCallToContextNodeMap)
4141     UpdateCalls(Entry.second, Visited, UpdateCalls);
4142 
4143   return Changed;
4144 }
4145 
4146 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4147     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4148     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4149         &FuncToAliasMap) {
4150   // The first "clone" is the original copy; we should only call this if we
4151   // needed to create new clones.
4152   assert(NumClones > 1);
4153   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4154   VMaps.reserve(NumClones - 1);
4155   FunctionsClonedThinBackend++;
4156   for (unsigned I = 1; I < NumClones; I++) {
4157     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4158     auto *NewF = CloneFunction(&F, *VMaps.back());
4159     FunctionClonesThinBackend++;
4160     // Strip memprof and callsite metadata from clone as they are no longer
4161     // needed.
4162     for (auto &BB : *NewF) {
4163       for (auto &Inst : BB) {
4164         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4165         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4166       }
4167     }
4168     std::string Name = getMemProfFuncName(F.getName(), I);
4169     auto *PrevF = M.getFunction(Name);
4170     if (PrevF) {
4171       // We might have created this when adjusting a callsite in another
4172       // function. It should be a declaration.
4173       assert(PrevF->isDeclaration());
4174       NewF->takeName(PrevF);
4175       PrevF->replaceAllUsesWith(NewF);
4176       PrevF->eraseFromParent();
4177     } else
4178       NewF->setName(Name);
4179     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4180              << "created clone " << ore::NV("NewFunction", NewF));
4181 
4182     // Now handle aliases to this function, and clone those as well.
4183     if (!FuncToAliasMap.count(&F))
4184       continue;
4185     for (auto *A : FuncToAliasMap[&F]) {
4186       std::string Name = getMemProfFuncName(A->getName(), I);
4187       auto *PrevA = M.getNamedAlias(Name);
4188       auto *NewA = GlobalAlias::create(A->getValueType(),
4189                                        A->getType()->getPointerAddressSpace(),
4190                                        A->getLinkage(), Name, NewF);
4191       NewA->copyAttributesFrom(A);
4192       if (PrevA) {
4193         // We might have created this when adjusting a callsite in another
4194         // function. It should be a declaration.
4195         assert(PrevA->isDeclaration());
4196         NewA->takeName(PrevA);
4197         PrevA->replaceAllUsesWith(NewA);
4198         PrevA->eraseFromParent();
4199       }
4200     }
4201   }
4202   return VMaps;
4203 }
4204 
4205 // Locate the summary for F. This is complicated by the fact that it might
4206 // have been internalized or promoted.
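// The lookup below tries, in order: the GUID of the function as it is now, the
// GUID of just its current name (in case it was internalized), the GUID of its
// original local name qualified by this module's source file name (in case it
// was promoted), and finally the index's original-GUID mapping (for promoted
// locals imported from another module).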
4207 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4208                                       const ModuleSummaryIndex *ImportSummary) {
4209   // FIXME: Ideally we would retain the original GUID in some fashion on the
4210   // function (e.g. as metadata), but for now do our best to locate the
4211   // summary without that information.
4212   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4213   if (!TheFnVI)
4214     // See if the function was internalized, by checking the index directly
4215     // with the original name (this avoids the name adjustment done by
4216     // getGUID() for internal symbols).
4217     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
4218   if (TheFnVI)
4219     return TheFnVI;
4220   // Now query with the original name before any promotion was performed.
4221   StringRef OrigName =
4222       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
4223   std::string OrigId = GlobalValue::getGlobalIdentifier(
4224       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
4225   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
4226   if (TheFnVI)
4227     return TheFnVI;
4228   // Could be a promoted local imported from another module. We need to pass
4229   // down more info here to find the original module id. For now, try with
4230   // the OrigName which might have been stored in the OidGuidMap in the
4231   // index. This would not work if there were same-named locals in multiple
4232   // modules, however.
4233   auto OrigGUID =
4234       ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
4235   if (OrigGUID)
4236     TheFnVI = ImportSummary->getValueInfo(OrigGUID);
4237   return TheFnVI;
4238 }
4239 
4240 bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
4241     Module &M) {
4242   ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
4243   Symtab = std::make_unique<InstrProfSymtab>();
4244   // Don't add canonical names, to avoid adding multiple functions to the
4245   // symtab when they have the same root name with "." suffixes stripped.
4246   // If we pick the wrong one then this could lead to incorrect ICP and calling
4247   // a memprof clone that we don't actually create (resulting in linker unsats).
4248   // What this means is that the GUID of the function (or its PGOFuncName
4249   // metadata) *must* match that in the VP metadata to allow promotion.
4250   // In practice this should not be a limitation, since local functions should
4251   // have PGOFuncName metadata and global function names shouldn't need any
4252   // special handling (they should not get the ".llvm.*" suffix that the
4253   // canonicalization handling is attempting to strip).
4254   if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
4255     std::string SymtabFailure = toString(std::move(E));
4256     M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
4257     return false;
4258   }
4259   return true;
4260 }
4261 
4262 bool MemProfContextDisambiguation::applyImport(Module &M) {
4263   assert(ImportSummary);
4264   bool Changed = false;
4265 
4266   // We also need to clone any aliases that reference cloned functions, because
4267   // the modified callsites may invoke via the alias. Keep track of the aliases
4268   // for each function.
4269   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4270       FuncToAliasMap;
4271   for (auto &A : M.aliases()) {
4272     auto *Aliasee = A.getAliaseeObject();
4273     if (auto *F = dyn_cast<Function>(Aliasee))
4274       FuncToAliasMap[F].insert(&A);
4275   }
4276 
4277   if (!initializeIndirectCallPromotionInfo(M))
4278     return false;
4279 
4280   for (auto &F : M) {
4281     if (F.isDeclaration() || isMemProfClone(F))
4282       continue;
4283 
4284     OptimizationRemarkEmitter ORE(&F);
4285 
4286     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4287     bool ClonesCreated = false;
4288     unsigned NumClonesCreated = 0;
4289     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
4290       // We should at least have version 0 which is the original copy.
4291       assert(NumClones > 0);
4292       // If only one copy needed use original.
4293       if (NumClones == 1)
4294         return;
4295       // If we already performed cloning of this function, confirm that the
4296       // requested number of clones matches (the thin link should ensure the
4297       // number of clones for each constituent callsite is consistent within
4298       // each function), before returning.
4299       if (ClonesCreated) {
4300         assert(NumClonesCreated == NumClones);
4301         return;
4302       }
4303       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
4304       // The first "clone" is the original copy, which doesn't have a VMap.
4305       assert(VMaps.size() == NumClones - 1);
4306       Changed = true;
4307       ClonesCreated = true;
4308       NumClonesCreated = NumClones;
4309     };
4310 
4311     auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
4312                              Function *CalledFunction) {
4313       // Perform cloning if not yet done.
4314       CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
4315 
4316       assert(!isMemProfClone(*CalledFunction));
4317 
4318       // Update the calls per the summary info.
4319       // Save orig name since it gets updated in the first iteration
4320       // below.
4321       auto CalleeOrigName = CalledFunction->getName();
4322       for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
4323         // Do nothing if this version calls the original version of its
4324         // callee.
4325         if (!StackNode.Clones[J])
4326           continue;
4327         auto NewF = M.getOrInsertFunction(
4328             getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
4329             CalledFunction->getFunctionType());
4330         CallBase *CBClone;
4331         // Copy 0 is the original function.
4332         if (!J)
4333           CBClone = CB;
4334         else
4335           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4336         CBClone->setCalledFunction(NewF);
4337         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4338                  << ore::NV("Call", CBClone) << " in clone "
4339                  << ore::NV("Caller", CBClone->getFunction())
4340                  << " assigned to call function clone "
4341                  << ore::NV("Callee", NewF.getCallee()));
4342       }
4343     };
4344 
4345     // Locate the summary for F.
4346     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
4347     // If not found, this could be an imported local (see comment in
4348     // findValueInfoForFunc). Skip for now as it will be cloned in its original
4349     // module (where it would have been promoted to global scope so should
4350     // satisfy any reference in this module).
4351     if (!TheFnVI)
4352       continue;
4353 
4354     auto *GVSummary =
4355         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
4356     if (!GVSummary) {
4357       // Must have been imported; use the summary which matches the definition
4358       // (there might be multiple if this was a linkonce_odr).
4359       auto SrcModuleMD = F.getMetadata("thinlto_src_module");
4360       assert(SrcModuleMD &&
4361              "enable-import-metadata is needed to emit thinlto_src_module");
4362       StringRef SrcModule =
4363           dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
4364       for (auto &GVS : TheFnVI.getSummaryList()) {
4365         if (GVS->modulePath() == SrcModule) {
4366           GVSummary = GVS.get();
4367           break;
4368         }
4369       }
4370       assert(GVSummary && GVSummary->modulePath() == SrcModule);
4371     }
4372 
4373     // If this was an imported alias skip it as we won't have the function
4374     // summary, and it should be cloned in the original module.
4375     if (isa<AliasSummary>(GVSummary))
4376       continue;
4377 
4378     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
4379 
4380     if (FS->allocs().empty() && FS->callsites().empty())
4381       continue;
4382 
4383     auto SI = FS->callsites().begin();
4384     auto AI = FS->allocs().begin();
4385 
4386     // To handle callsite infos synthesized for tail calls which have missing
4387     // frames in the profiled context, map callee VI to the synthesized callsite
4388     // info.
4389     DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
4390     // Iterate the callsites for this function in reverse, since we place all
4391     // those synthesized for tail calls at the end.
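    // Such synthesized entries have empty StackIdIndices and simply record the
    // discovered tail-call callee, so they can be looked up by callee ValueInfo
    // when a matching tail call is encountered below.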
4392     for (auto CallsiteIt = FS->callsites().rbegin();
4393          CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
4394       auto &Callsite = *CallsiteIt;
4395       // Stop as soon as we see a non-synthesized callsite info (see comment
4396       // above loop). All the entries added for discovered tail calls have empty
4397       // stack ids.
4398       if (!Callsite.StackIdIndices.empty())
4399         break;
4400       MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
4401     }
4402 
4403     // Keeps track of needed ICP for the function.
4404     SmallVector<ICallAnalysisData> ICallAnalysisInfo;
4405 
4406     // Assume for now that the instructions are in the exact same order
4407     // as when the summary was created, but confirm this is correct by
4408     // matching the stack ids.
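    // The SI (callsite summary) and AI (alloc summary) iterators declared above
    // are advanced in lockstep with the matching instructions encountered below.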
4409     for (auto &BB : F) {
4410       for (auto &I : BB) {
4411         auto *CB = dyn_cast<CallBase>(&I);
4412         // Same handling as when creating module summary.
4413         if (!mayHaveMemprofSummary(CB))
4414           continue;
4415 
4416         auto *CalledValue = CB->getCalledOperand();
4417         auto *CalledFunction = CB->getCalledFunction();
4418         if (CalledValue && !CalledFunction) {
4419           CalledValue = CalledValue->stripPointerCasts();
4420           // Stripping pointer casts can reveal a called function.
4421           CalledFunction = dyn_cast<Function>(CalledValue);
4422         }
4423         // Check if this is an alias to a function. If so, get the
4424         // called aliasee for the checks below.
4425         if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
4426           assert(!CalledFunction &&
4427                  "Expected null called function in callsite for alias");
4428           CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
4429         }
4430 
4431         CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
4432             I.getMetadata(LLVMContext::MD_callsite));
4433         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
4434 
4435         // Include allocs that were already assigned a memprof function
4436         // attribute in the statistics.
4437         if (CB->getAttributes().hasFnAttr("memprof")) {
4438           assert(!MemProfMD);
4439           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
4440               ? AllocTypeColdThinBackend++
4441               : AllocTypeNotColdThinBackend++;
4442           OrigAllocsThinBackend++;
4443           AllocVersionsThinBackend++;
4444           if (!MaxAllocVersionsThinBackend)
4445             MaxAllocVersionsThinBackend = 1;
4446           continue;
4447         }
4448 
4449         if (MemProfMD) {
4450           // Consult the next alloc node.
4451           assert(AI != FS->allocs().end());
4452           auto &AllocNode = *(AI++);
4453 
4454           // Sanity check that the MIB stack ids match between the summary and
4455           // instruction metadata.
4456           auto MIBIter = AllocNode.MIBs.begin();
4457           for (auto &MDOp : MemProfMD->operands()) {
4458             assert(MIBIter != AllocNode.MIBs.end());
4459             LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
4460                 MIBIter->StackIdIndices.begin();
4461             auto *MIBMD = cast<const MDNode>(MDOp);
4462             MDNode *StackMDNode = getMIBStackNode(MIBMD);
4463             assert(StackMDNode);
4464             CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
4465             auto ContextIterBegin =
4466                 StackContext.beginAfterSharedPrefix(CallsiteContext);
4467             // Skip the checking on the first iteration.
4468             uint64_t LastStackContextId =
4469                 (ContextIterBegin != StackContext.end() &&
4470                  *ContextIterBegin == 0)
4471                     ? 1
4472                     : 0;
4473             for (auto ContextIter = ContextIterBegin;
4474                  ContextIter != StackContext.end(); ++ContextIter) {
4475               // If this is a direct recursion, simply skip the duplicate
4476               // entries, to be consistent with how the summary ids were
4477               // generated during ModuleSummaryAnalysis.
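              // (E.g., a profiled context A->B->B->C would have been recorded
              // in the summary with stack id indices for just A, B, C.)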
4478               if (LastStackContextId == *ContextIter)
4479                 continue;
4480               LastStackContextId = *ContextIter;
4481               assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
4482               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4483                      *ContextIter);
4484               StackIdIndexIter++;
4485             }
4486             MIBIter++;
4487           }
4488 
4489           // Perform cloning if not yet done.
4490           CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
4491 
4492           OrigAllocsThinBackend++;
4493           AllocVersionsThinBackend += AllocNode.Versions.size();
4494           if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
4495             MaxAllocVersionsThinBackend = AllocNode.Versions.size();
4496 
4497           // If there is only one version, that means we didn't end up
4498           // considering this function for cloning, and in that case the alloc
4499           // will still be None type or should have gotten the default NotCold.
4500           // Skip it, but only after calling the clone helper, since the helper
4501           // does some sanity checks that confirm we haven't yet decided that we
4502           // need cloning. We might have a single version that is cold due to the
4503           // MinClonedColdBytePercent heuristic; make sure we don't skip in that
4504           // case.
4505           if (AllocNode.Versions.size() == 1 &&
4506               (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
4507             assert((AllocationType)AllocNode.Versions[0] ==
4508                        AllocationType::NotCold ||
4509                    (AllocationType)AllocNode.Versions[0] ==
4510                        AllocationType::None);
4511             UnclonableAllocsThinBackend++;
4512             continue;
4513           }
4514 
4515           // All versions should have a singular allocation type.
4516           assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
4517             return Type == ((uint8_t)AllocationType::NotCold |
4518                             (uint8_t)AllocationType::Cold);
4519           }));
4520 
4521           // Update the allocation types per the summary info.
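          // E.g., with Versions == {NotCold, Cold}, the original copy gets a
          // "notcold" memprof attribute and function clone 1 gets "cold".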
4522           for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
4523             // Ignore any that didn't get an assigned allocation type.
4524             if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
4525               continue;
4526             AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
4527             AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
4528                                             : AllocTypeNotColdThinBackend++;
4529             std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
4530             auto A = llvm::Attribute::get(F.getContext(), "memprof",
4531                                           AllocTypeString);
4532             CallBase *CBClone;
4533             // Copy 0 is the original function.
4534             if (!J)
4535               CBClone = CB;
4536             else
4537               // Since VMaps are only created for new clones, we index with
4538               // clone J-1 (J==0 is the original clone and does not have a VMaps
4539               // entry).
4540               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4541             CBClone->addFnAttr(A);
4542             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
4543                      << ore::NV("AllocationCall", CBClone) << " in clone "
4544                      << ore::NV("Caller", CBClone->getFunction())
4545                      << " marked with memprof allocation attribute "
4546                      << ore::NV("Attribute", AllocTypeString));
4547           }
4548         } else if (!CallsiteContext.empty()) {
4549           if (!CalledFunction) {
4550 #ifndef NDEBUG
4551             // We should have skipped inline assembly calls.
4552             auto *CI = dyn_cast<CallInst>(CB);
4553             assert(!CI || !CI->isInlineAsm());
4554 #endif
4555             // We should have skipped direct calls via a Constant.
4556             assert(CalledValue && !isa<Constant>(CalledValue));
4557 
4558             // This is an indirect call, see if we have profile information and
4559             // whether any clones were recorded for the profiled targets (that
4560             // we synthesized CallsiteInfo summary records for when building the
4561             // index).
4562             auto NumClones =
4563                 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
4564 
4565             // Perform cloning if not yet done. This is done here in case
4566             // we don't need to do ICP, but might need to clone this
4567             // function as it is the target of other cloned calls.
4568             if (NumClones)
4569               CloneFuncIfNeeded(NumClones);
4570           }
4571 
4572           else {
4573             // Consult the next callsite node.
4574             assert(SI != FS->callsites().end());
4575             auto &StackNode = *(SI++);
4576 
4577 #ifndef NDEBUG
4578             // Sanity check that the stack ids match between the summary and
4579             // instruction metadata.
4580             auto StackIdIndexIter = StackNode.StackIdIndices.begin();
4581             for (auto StackId : CallsiteContext) {
4582               assert(StackIdIndexIter != StackNode.StackIdIndices.end());
4583               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
4584                      StackId);
4585               StackIdIndexIter++;
4586             }
4587 #endif
4588 
4589             CloneCallsite(StackNode, CB, CalledFunction);
4590           }
4591         } else if (CB->isTailCall() && CalledFunction) {
4592           // Locate the synthesized callsite info for the callee VI, if any was
4593           // created, and use that for cloning.
4594           ValueInfo CalleeVI =
4595               findValueInfoForFunc(*CalledFunction, M, ImportSummary);
4596           if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
4597             auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
4598             assert(Callsite != MapTailCallCalleeVIToCallsite.end());
4599             CloneCallsite(Callsite->second, CB, CalledFunction);
4600           }
4601         }
4602       }
4603     }
4604 
4605     // Now do any promotion required for cloning.
4606     performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
4607   }
4608 
4609   // We skip some of the functions and instructions above, so remove all the
4610   // metadata in a single sweep here.
4611   for (auto &F : M) {
4612     // We can skip memprof clones because createFunctionClones already strips
4613     // the metadata from the newly created clones.
4614     if (F.isDeclaration() || isMemProfClone(F))
4615       continue;
4616     for (auto &BB : F) {
4617       for (auto &I : BB) {
4618         if (!isa<CallBase>(I))
4619           continue;
4620         I.setMetadata(LLVMContext::MD_memprof, nullptr);
4621         I.setMetadata(LLVMContext::MD_callsite, nullptr);
4622       }
4623     }
4624   }
4625 
4626   return Changed;
4627 }
4628 
4629 unsigned MemProfContextDisambiguation::recordICPInfo(
4630     CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
4631     ArrayRef<CallsiteInfo>::iterator &SI,
4632     SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
4633   // First see if we have profile information for this indirect call.
4634   uint32_t NumCandidates;
4635   uint64_t TotalCount;
4636   auto CandidateProfileData =
4637       ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
4638                                                           NumCandidates);
4639   if (CandidateProfileData.empty())
4640     return 0;
4641 
4642   // Iterate through all of the candidate profiled targets along with the
4643   // CallsiteInfo summary records synthesized for them when building the index,
4644   // and see if any are cloned and/or refer to clones.
4645   bool ICPNeeded = false;
4646   unsigned NumClones = 0;
4647   size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
4648   for (const auto &Candidate : CandidateProfileData) {
4649 #ifndef NDEBUG
4650     auto CalleeValueInfo =
4651 #endif
4652         ImportSummary->getValueInfo(Candidate.Value);
4653     // We might not have a ValueInfo if this is a distributed
4654     // ThinLTO backend and we decided not to import that function.
4655     assert(SI != AllCallsites.end());
4656     assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
4657     auto &StackNode = *(SI++);
4658     // See if any of the clones of the indirect callsite for this
4659     // profiled target should call a cloned version of the profiled
4660     // target. We only need to do the ICP here if so.
4661     ICPNeeded |= llvm::any_of(StackNode.Clones,
4662                               [](unsigned CloneNo) { return CloneNo != 0; });
4663     // Every callsite in the same function should have been cloned the same
4664     // number of times.
4665     assert(!NumClones || NumClones == StackNode.Clones.size());
4666     NumClones = StackNode.Clones.size();
4667   }
4668   if (!ICPNeeded)
4669     return NumClones;
4670   // Save information for ICP, which is performed later to avoid messing up the
4671   // current function traversal.
4672   ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
4673                                TotalCount, CallsiteInfoStartIndex});
4674   return NumClones;
4675 }
4676 
4677 void MemProfContextDisambiguation::performICP(
4678     Module &M, ArrayRef<CallsiteInfo> AllCallsites,
4679     ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
4680     ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
4681     OptimizationRemarkEmitter &ORE) {
4682   // Now do any promotion required for cloning. Specifically, for each
4683   // recorded ICP candidate (which was only recorded because one clone of that
4684   // candidate should call a cloned target), we perform ICP (speculative
4685   // devirtualization) for each clone of the callsite, and update its callee
4686   // to the appropriate clone. Note that the ICP compares against the original
4687   // version of the target, which is what is in the vtable.
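  // Illustratively, for each call clone this turns an indirect call through
  // %ptr into (roughly):
  //   if (%ptr == @Target) call @Target_clone(...); else call %ptr(...);
  // where the pointer comparison is against the original @Target (what the
  // vtable holds), but the resulting direct call is redirected to the chosen
  // clone of the target (written here as @Target_clone for illustration only).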
4688   for (auto &Info : ICallAnalysisInfo) {
4689     auto *CB = Info.CB;
4690     auto CallsiteIndex = Info.CallsiteInfoStartIndex;
4691     auto TotalCount = Info.TotalCount;
4692     unsigned NumPromoted = 0;
4693     unsigned NumClones = 0;
4694 
4695     for (auto &Candidate : Info.CandidateProfileData) {
4696       auto &StackNode = AllCallsites[CallsiteIndex++];
4697 
4698       // All calls in the same function must have the same number of clones.
4699       assert(!NumClones || NumClones == StackNode.Clones.size());
4700       NumClones = StackNode.Clones.size();
4701 
4702       // See if the target is in the module. If it wasn't imported, it is
4703       // possible that this profile was collected on a different binary (or
4704       // version of the code), and we need to be conservative (similar to what
4705       // is done in the ICP pass).
4706       Function *TargetFunction = Symtab->getFunction(Candidate.Value);
4707       if (TargetFunction == nullptr ||
4708           // Any ThinLTO global dead symbol removal should have already
4709           // occurred, so it should be safe to promote when the target is a
4710           // declaration.
4711           // TODO: Remove internal option once more fully tested.
4712           (MemProfRequireDefinitionForPromotion &&
4713            TargetFunction->isDeclaration())) {
4714         ORE.emit([&]() {
4715           return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
4716                  << "Memprof cannot promote indirect call: target with md5sum "
4717                  << ore::NV("target md5sum", Candidate.Value) << " not found";
4718         });
4719         // FIXME: See if we can use the new declaration importing support to
4720         // at least get the declarations imported for this case. Hot indirect
4721         // targets should have been imported normally, however.
4722         continue;
4723       }
4724 
4725       // Check whether it is legal to promote this call.
4726       const char *Reason = nullptr;
4727       if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
4728         ORE.emit([&]() {
4729           return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
4730                  << "Memprof cannot promote indirect call to "
4731                  << ore::NV("TargetFunction", TargetFunction)
4732                  << " with count of " << ore::NV("TotalCount", TotalCount)
4733                  << ": " << Reason;
4734         });
4735         continue;
4736       }
4737 
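      // The profiled target looked up in the symtab is always an original
      // function; any redirection to a memprof clone happens below, after
      // promotion.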
4738       assert(!isMemProfClone(*TargetFunction));
4739 
4740       // Handle each call clone, applying ICP so that each clone directly
4741       // calls the specified callee clone, guarded by the appropriate ICP
4742       // check.
4743       CallBase *CBClone = CB;
4744       for (unsigned J = 0; J < NumClones; J++) {
4745         // Copy 0 is the original function.
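        // VMaps[J - 1] maps values in the original function to the
        // corresponding values in function clone J, which lets us locate this
        // callsite in each clone.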
4746         if (J > 0)
4747           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4748         // We do the promotion using the original name, so that the comparison
4749         // is against the name in the vtable. Then just below, change the new
4750         // direct call to call the cloned function.
4751         auto &DirectCall =
4752             pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
4753                                      TotalCount, isSamplePGO, &ORE);
4754         auto *TargetToUse = TargetFunction;
4755         // Keep calling the original target if this callsite clone calls the
4756         // original version of its callee; otherwise redirect to the callee clone.
4757         if (StackNode.Clones[J]) {
4758           TargetToUse =
4759               cast<Function>(M.getOrInsertFunction(
4760                                   getMemProfFuncName(TargetFunction->getName(),
4761                                                      StackNode.Clones[J]),
4762                                   TargetFunction->getFunctionType())
4763                                  .getCallee());
4764         }
4765         DirectCall.setCalledFunction(TargetToUse);
4766         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
4767                  << ore::NV("Call", CBClone) << " in clone "
4768                  << ore::NV("Caller", CBClone->getFunction())
4769                  << " promoted and assigned to call function clone "
4770                  << ore::NV("Callee", TargetToUse));
4771       }
4772 
4773       // Update TotalCount (all clones should get the same count above).
4774       TotalCount -= Candidate.Count;
4775       NumPromoted++;
4776     }
4777     // Adjust the MD.prof metadata for all clones, now that we have the new
4778     // TotalCount and the number promoted.
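    // For example (illustrative): if three candidates were recorded and the
    // first two were promoted above, only the single remaining un-promoted
    // record is re-attached, with TotalCount reduced by the promoted counts.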
4779     CallBase *CBClone = CB;
4780     for (unsigned J = 0; J < NumClones; J++) {
4781       // Copy 0 is the original function.
4782       if (J > 0)
4783         CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
4784       // First delete the old one.
4785       CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
4786       // If all candidates were promoted, we don't need the MD.prof metadata.
4787       // Otherwise, re-annotate it with the remaining un-promoted records.
4788       if (TotalCount != 0)
4789         annotateValueSite(
4790             M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
4791             TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
4792     }
4793   }
4794 }
4795 
4796 template <typename DerivedCCG, typename FuncTy, typename CallTy>
4797 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
4798   if (DumpCCG) {
4799     dbgs() << "CCG before cloning:\n";
4800     dbgs() << *this;
4801   }
4802   if (ExportToDot)
4803     exportToDot("postbuild");
4804 
4805   if (VerifyCCG) {
4806     check();
4807   }
4808 
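  // Node-level cloning: duplicate context graph nodes where needed so that,
  // where possible, the contexts reaching an allocation through a given node
  // share a single allocation type (see identifyClones).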
4809   identifyClones();
4810 
4811   if (VerifyCCG) {
4812     check();
4813   }
4814 
4815   if (DumpCCG) {
4816     dbgs() << "CCG after cloning:\n";
4817     dbgs() << *this;
4818   }
4819   if (ExportToDot)
4820     exportToDot("cloned");
4821 
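  // Materialize the node-level cloning decisions: create function clones (or,
  // for the index, record clone assignments in the summary) and assign each
  // cloned callsite the callee clone it should call (see assignFunctions).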
4822   bool Changed = assignFunctions();
4823 
4824   if (DumpCCG) {
4825     dbgs() << "CCG after assigning function clones:\n";
4826     dbgs() << *this;
4827   }
4828   if (ExportToDot)
4829     exportToDot("clonefuncassign");
4830 
4831   if (MemProfReportHintedSizes)
4832     printTotalSizes(errs());
4833 
4834   return Changed;
4835 }
4836 
4837 bool MemProfContextDisambiguation::processModule(
4838     Module &M,
4839     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
4840 
4841   // If we have an import summary, then the cloning decisions were made during
4842   // the thin link on the index. Apply them and return.
4843   if (ImportSummary)
4844     return applyImport(M);
4845 
4846   // TODO: If/when other types of memprof cloning are enabled beyond just for
4847   // hot and cold, we will need to change this to individually control the
4848   // AllocationType passed to addStackNodesForMIB during CCG construction.
4849   // Note that we specifically check this after applying imports above, so that
4850   // the option doesn't need to be passed to distributed ThinLTO backend
4851   // clang processes, which won't necessarily have visibility into the linker
4852   // dependencies. Instead, the information is communicated from the LTO link
4853   // to the backends via the combined summary index.
4854   if (!SupportsHotColdNew)
4855     return false;
4856 
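  // No import summary: build the callsite context graph directly from the IR
  // and run the full cloning process here.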
4857   ModuleCallsiteContextGraph CCG(M, OREGetter);
4858   return CCG.process();
4859 }
4860 
4861 MemProfContextDisambiguation::MemProfContextDisambiguation(
4862     const ModuleSummaryIndex *Summary, bool isSamplePGO)
4863     : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
4864   if (ImportSummary) {
4865     // The MemProfImportSummary option should only be used when testing the
4866     // ThinLTO distributed backend handling via opt, in which case we would
4867     // not have been given a summary from the pass pipeline.
4868     assert(MemProfImportSummary.empty());
4869     return;
4870   }
4871   if (MemProfImportSummary.empty())
4872     return;
4873 
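  // Illustrative usage for testing (assuming the string option defined earlier
  // in this file is exposed as -memprof-import-summary):
  //   opt -passes=memprof-context-disambiguation \
  //       -memprof-import-summary=summary.thinlto.bc module.ll -S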
4874   auto ReadSummaryFile =
4875       errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
4876   if (!ReadSummaryFile) {
4877     logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
4878                           "Error loading file '" + MemProfImportSummary +
4879                               "': ");
4880     return;
4881   }
4882   auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
4883   if (!ImportSummaryForTestingOrErr) {
4884     logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
4885                           "Error parsing file '" + MemProfImportSummary +
4886                               "': ");
4887     return;
4888   }
4889   ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
4890   ImportSummary = ImportSummaryForTesting.get();
4891 }
4892 
4893 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
4894                                                     ModuleAnalysisManager &AM) {
4895   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
4896   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
4897     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
4898   };
4899   if (!processModule(M, OREGetter))
4900     return PreservedAnalyses::all();
4901   return PreservedAnalyses::none();
4902 }
4903 
4904 void MemProfContextDisambiguation::run(
4905     ModuleSummaryIndex &Index,
4906     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
4907         isPrevailing) {
4908   // TODO: If/when other types of memprof cloning are enabled beyond just for
4909   // hot and cold, we will need to change this to individually control the
4910   // AllocationType passed to addStackNodesForMIB during CCG construction.
4911   // The index was set from the option, so these should be in sync.
4912   assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
4913   if (!SupportsHotColdNew)
4914     return;
4915 
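  // Build the graph from the combined summary index during the thin link; the
  // cloning decisions it records in the index are applied to IR later in the
  // ThinLTO backends (see applyImport).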
4916   IndexCallsiteContextGraph CCG(Index, isPrevailing);
4917   CCG.process();
4918 }
4919