xref: /llvm-project/llvm/include/llvm/Analysis/IRSimilarityIdentifier.h (revision 0da2ba811ac8a01509bc533428941fb9519c0715)
1 //===- IRSimilarityIdentifier.h - Find similarity in a module --------------==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // Interface file for the IRSimilarityIdentifier for identifying similarities in
11 // IR including the IRInstructionMapper, which maps an Instruction to unsigned
12 // integers.
13 //
14 // Two sequences of instructions are called "similar" if they perform the same
15 // series of operations for all inputs.
16 //
17 // \code
18 // %1 = add i32 %a, 10
19 // %2 = add i32 %a, %1
20 // %3 = icmp slt icmp %1, %2
21 // \endcode
22 //
23 // and
24 //
25 // \code
26 // %1 = add i32 11, %a
27 // %2 = sub i32 %a, %1
28 // %3 = icmp sgt icmp %2, %1
29 // \endcode
30 //
31 // ultimately have the same result, even if the inputs, and structure are
32 // slightly different.
33 //
34 // For instructions, we do not worry about operands that do not have fixed
35 // semantic meaning to the program.  We consider the opcode that the instruction
36 // has, the types, parameters, and extra information such as the function name,
37 // or comparison predicate.  These are used to create a hash to map instructions
38 // to integers to be used in similarity matching in sequences of instructions
39 //
40 // Terminology:
41 // An IRSimilarityCandidate is a region of IRInstructionData (wrapped
42 // Instructions), usually used to denote a region of similarity has been found.
43 //
44 // A SimilarityGroup is a set of IRSimilarityCandidates that are structurally
45 // similar to one another.
46 //
47 //===----------------------------------------------------------------------===//
48 
49 #ifndef LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
50 #define LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
51 
52 #include "llvm/IR/InstVisitor.h"
53 #include "llvm/IR/Instructions.h"
54 #include "llvm/IR/PassManager.h"
55 #include "llvm/Pass.h"
56 #include "llvm/Support/Allocator.h"
57 #include <optional>
58 
59 namespace llvm {
60 
61 namespace IRSimilarity {
62 
63 struct IRInstructionDataList;
64 
65 /// This represents what is and is not supported when finding similarity in
66 /// Instructions.
67 ///
68 /// Legal Instructions are considered when looking at similarity between
69 /// Instructions.
70 ///
71 /// Illegal Instructions cannot be considered when looking for similarity
72 /// between Instructions. They act as boundaries between similarity regions.
73 ///
74 /// Invisible Instructions are skipped over during analysis.
75 // TODO: Shared with MachineOutliner
76 enum InstrType { Legal, Illegal, Invisible };
77 
78 /// This provides the utilities for hashing an Instruction to an unsigned
79 /// integer. Two IRInstructionDatas produce the same hash value when their
80 /// underlying Instructions perform the same operation (even if they don't have
81 /// the same input operands.)
82 /// As a more concrete example, consider the following:
83 ///
84 /// \code
85 /// %add1 = add i32 %a, %b
86 /// %add2 = add i32 %c, %d
87 /// %add3 = add i64 %e, %f
88 /// \endcode
89 ///
90 // Then the IRInstructionData wrappers for these Instructions may be hashed like
91 /// so:
92 ///
93 /// \code
94 /// ; These two adds have the same types and operand types, so they hash to the
95 /// ; same number.
96 /// %add1 = add i32 %a, %b ; Hash: 1
97 /// %add2 = add i32 %c, %d ; Hash: 1
98 /// ; This add produces an i64. This differentiates it from %add1 and %add2. So,
99 /// ; it hashes to a different number.
100 /// %add3 = add i64 %e, %f; Hash: 2
101 /// \endcode
102 ///
103 ///
104 /// This hashing scheme will be used to represent the program as a very long
105 /// string. This string can then be placed in a data structure which can be used
106 /// for similarity queries.
107 ///
108 /// TODO: Handle types of Instructions which can be equal even with different
109 /// operands. (E.g. comparisons with swapped predicates.)
110 /// TODO: Handle CallInsts, which are only checked for function type
111 /// by \ref isSameOperationAs.
112 /// TODO: Handle GetElementPtrInsts, as some of the operands have to be the
113 /// exact same, and some do not.
114 struct IRInstructionData
115     : ilist_node<IRInstructionData, ilist_sentinel_tracking<true>> {
116 
117   /// The source Instruction that is being wrapped.
118   Instruction *Inst = nullptr;
119   /// The values of the operands in the Instruction.
120   SmallVector<Value *, 4> OperVals;
121   /// The legality of the wrapped instruction. This is informed by InstrType,
122   /// and is used when checking when two instructions are considered similar.
123   /// If either instruction is not legal, the instructions are automatically not
124   /// considered similar.
125   bool Legal = false;
126 
127   /// This is only relevant if we are wrapping a CmpInst where we needed to
128   /// change the predicate of a compare instruction from a greater than form
129   /// to a less than form.  It is std::nullopt otherwise.
130   std::optional<CmpInst::Predicate> RevisedPredicate;
131 
132   /// This is only relevant if we are wrapping a CallInst. If we are requiring
133   /// that the function calls have matching names as well as types, and the
134   /// call is not an indirect call, this will hold the name of the function.  If
135   /// it is an indirect string, it will be the empty string.  However, if this
136   /// requirement is not in place it will be the empty string regardless of the
137   /// function call type.  The value held here is used to create the hash of the
138   /// instruction, and check to make sure two instructions are close to one
139   /// another.
140   std::optional<std::string> CalleeName;
141 
142   /// This structure holds the distances of how far "ahead of" or "behind" the
143   /// target blocks of a branch, or the incoming blocks of a phi nodes are.
144   /// If the value is negative, it means that the block was registered before
145   /// the block of this instruction in terms of blocks in the function.
146   /// Code Example:
147   /// \code
148   /// block_1:
149   ///   br i1 %0, label %block_2, label %block_3
150   /// block_2:
151   ///   br i1 %1, label %block_1, label %block_2
152   /// block_3:
153   ///   br i1 %2, label %block_2, label %block_1
154   /// ; Replacing the labels with relative values, this becomes:
155   /// block_1:
156   ///   br i1 %0, distance 1, distance 2
157   /// block_2:
158   ///   br i1 %1, distance -1, distance 0
159   /// block_3:
160   ///   br i1 %2, distance -1, distance -2
161   /// \endcode
162   /// Taking block_2 as our example, block_1 is "behind" block_2, and block_2 is
163   /// "ahead" of block_2.
164   SmallVector<int, 4> RelativeBlockLocations;
165 
166   /// Gather the information that is difficult to gather for an Instruction, or
167   /// is changed. i.e. the operands of an Instruction and the Types of those
168   /// operands. This extra information allows for similarity matching to make
169   /// assertions that allow for more flexibility when checking for whether an
170   /// Instruction performs the same operation.
171   IRInstructionData(Instruction &I, bool Legality, IRInstructionDataList &IDL);
172   IRInstructionData(IRInstructionDataList &IDL);
173 
174   /// Fills data stuctures for IRInstructionData when it is constructed from a
175   // reference or a pointer.
176   void initializeInstruction();
177 
178   /// Get the predicate that the compare instruction is using for hashing the
179   /// instruction. the IRInstructionData must be wrapping a CmpInst.
180   CmpInst::Predicate getPredicate() const;
181 
182   /// Get the callee name that the call instruction is using for hashing the
183   /// instruction. The IRInstructionData must be wrapping a CallInst.
184   StringRef getCalleeName() const;
185 
186   /// A function that swaps the predicates to their less than form if they are
187   /// in a greater than form. Otherwise, the predicate is unchanged.
188   ///
189   /// \param CI - The comparison operation to find a consistent preidcate for.
190   /// \return the consistent comparison predicate.
191   static CmpInst::Predicate predicateForConsistency(CmpInst *CI);
192 
193   /// For an IRInstructionData containing a branch, finds the
194   /// relative distances from the source basic block to the target by taking
195   /// the difference of the number assigned to the current basic block and the
196   /// target basic block of the branch.
197   ///
198   /// \param BasicBlockToInteger - The mapping of basic blocks to their location
199   /// in the module.
200   void
201   setBranchSuccessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
202 
203   /// For an IRInstructionData containing a CallInst, set the function name
204   /// appropriately.  This will be an empty string if it is an indirect call,
205   /// or we are not matching by name of the called function.  It will be the
206   /// name of the function if \p MatchByName is true and it is not an indirect
207   /// call.  We may decide not to match by name in order to expand the
208   /// size of the regions we can match.  If a function name has the same type
209   /// signature, but the different name, the region of code is still almost the
210   /// same.  Since function names can be treated as constants, the name itself
211   /// could be extrapolated away.  However, matching by name provides a
212   /// specificity and more "identical" code than not matching by name.
213   ///
214   /// \param MatchByName - A flag to mark whether we are using the called
215   /// function name as a differentiating parameter.
216   void setCalleeName(bool MatchByName = true);
217 
218   /// For an IRInstructionData containing a PHINode, finds the
219   /// relative distances from the incoming basic block to the current block by
220   /// taking the difference of the number assigned to the current basic block
221   /// and the incoming basic block of the branch.
222   ///
223   /// \param BasicBlockToInteger - The mapping of basic blocks to their location
224   /// in the module.
225   void
226   setPHIPredecessors(DenseMap<BasicBlock *, unsigned> &BasicBlockToInteger);
227 
228   /// Get the BasicBlock based operands for PHINodes and BranchInsts.
229   ///
230   /// \returns A list of relevant BasicBlocks.
231   ArrayRef<Value *> getBlockOperVals();
232 
233   /// Hashes \p Value based on its opcode, types, and operand types.
234   /// Two IRInstructionData instances produce the same hash when they perform
235   /// the same operation.
236   ///
237   /// As a simple example, consider the following instructions.
238   ///
239   /// \code
240   /// %add1 = add i32 %x1, %y1
241   /// %add2 = add i32 %x2, %y2
242   ///
243   /// %sub = sub i32 %x1, %y1
244   ///
245   /// %add_i64 = add i64 %x2, %y2
246   /// \endcode
247   ///
248   /// Because the first two adds operate the same types, and are performing the
249   /// same action, they will be hashed to the same value.
250   ///
251   /// However, the subtraction instruction is not the same as an addition, and
252   /// will be hashed to a different value.
253   ///
254   /// Finally, the last add has a different type compared to the first two add
255   /// instructions, so it will also be hashed to a different value that any of
256   /// the previous instructions.
257   ///
258   /// \param [in] ID - The IRInstructionData instance to be hashed.
259   /// \returns A hash_value of the IRInstructionData.
260   friend hash_code hash_value(const IRInstructionData &ID) {
261     SmallVector<Type *, 4> OperTypes;
262     for (Value *V : ID.OperVals)
263       OperTypes.push_back(V->getType());
264 
265     if (isa<CmpInst>(ID.Inst))
266       return llvm::hash_combine(
267           llvm::hash_value(ID.Inst->getOpcode()),
268           llvm::hash_value(ID.Inst->getType()),
269           llvm::hash_value(ID.getPredicate()),
270           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
271 
272     if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(ID.Inst)) {
273       // To hash intrinsics, we use the opcode, and types like the other
274       // instructions, but also, the Intrinsic ID, and the Name of the
275       // intrinsic.
276       Intrinsic::ID IntrinsicID = II->getIntrinsicID();
277       return llvm::hash_combine(
278           llvm::hash_value(ID.Inst->getOpcode()),
279           llvm::hash_value(ID.Inst->getType()), llvm::hash_value(IntrinsicID),
280           llvm::hash_value(*ID.CalleeName),
281           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
282     }
283 
284     if (isa<CallInst>(ID.Inst)) {
285       std::string FunctionName = *ID.CalleeName;
286       return llvm::hash_combine(
287           llvm::hash_value(ID.Inst->getOpcode()),
288           llvm::hash_value(ID.Inst->getType()),
289           llvm::hash_value(ID.Inst->getType()), llvm::hash_value(FunctionName),
290           llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
291     }
292 
293     return llvm::hash_combine(
294         llvm::hash_value(ID.Inst->getOpcode()),
295         llvm::hash_value(ID.Inst->getType()),
296         llvm::hash_combine_range(OperTypes.begin(), OperTypes.end()));
297   }
298 
299   IRInstructionDataList *IDL = nullptr;
300 };
301 
302 struct IRInstructionDataList
303     : simple_ilist<IRInstructionData, ilist_sentinel_tracking<true>> {};
304 
305 /// Compare one IRInstructionData class to another IRInstructionData class for
306 /// whether they are performing a the same operation, and can mapped to the
307 /// same value. For regular instructions if the hash value is the same, then
308 /// they will also be close.
309 ///
310 /// \param A - The first IRInstructionData class to compare
311 /// \param B - The second IRInstructionData class to compare
312 /// \returns true if \p A and \p B are similar enough to be mapped to the same
313 /// value.
314 bool isClose(const IRInstructionData &A, const IRInstructionData &B);
315 
316 struct IRInstructionDataTraits : DenseMapInfo<IRInstructionData *> {
317   static inline IRInstructionData *getEmptyKey() { return nullptr; }
318   static inline IRInstructionData *getTombstoneKey() {
319     return reinterpret_cast<IRInstructionData *>(-1);
320   }
321 
322   static unsigned getHashValue(const IRInstructionData *E) {
323     using llvm::hash_value;
324     assert(E && "IRInstructionData is a nullptr?");
325     return hash_value(*E);
326   }
327 
328   static bool isEqual(const IRInstructionData *LHS,
329                       const IRInstructionData *RHS) {
330     if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
331         LHS == getEmptyKey() || LHS == getTombstoneKey())
332       return LHS == RHS;
333 
334     assert(LHS && RHS && "nullptr should have been caught by getEmptyKey?");
335     return isClose(*LHS, *RHS);
336   }
337 };
338 
339 /// Helper struct for converting the Instructions in a Module into a vector of
340 /// unsigned integers. This vector of unsigned integers can be thought of as a
341 /// "numeric string". This numeric string can then be queried by, for example,
342 /// data structures that find repeated substrings.
343 ///
344 /// This hashing is done per BasicBlock in the module. To hash Instructions
345 /// based off of their operations, each Instruction is wrapped in an
346 /// IRInstructionData struct. The unsigned integer for an IRInstructionData
347 /// depends on:
348 /// - The hash provided by the IRInstructionData.
349 /// - Which member of InstrType the IRInstructionData is classified as.
350 // See InstrType for more details on the possible classifications, and how they
351 // manifest in the numeric string.
352 ///
353 /// The numeric string for an individual BasicBlock is terminated by an unique
354 /// unsigned integer. This prevents data structures which rely on repetition
355 /// from matching across BasicBlocks. (For example, the SuffixTree.)
356 /// As a concrete example, if we have the following two BasicBlocks:
357 /// \code
358 /// bb0:
359 /// %add1 = add i32 %a, %b
360 /// %add2 = add i32 %c, %d
361 /// %add3 = add i64 %e, %f
362 /// bb1:
363 /// %sub = sub i32 %c, %d
364 /// \endcode
365 /// We may hash the Instructions like this (via IRInstructionData):
366 /// \code
367 /// bb0:
368 /// %add1 = add i32 %a, %b ; Hash: 1
369 /// %add2 = add i32 %c, %d; Hash: 1
370 /// %add3 = add i64 %e, %f; Hash: 2
371 /// bb1:
372 /// %sub = sub i32 %c, %d; Hash: 3
373 /// %add4 = add i32 %c, %d ; Hash: 1
374 /// \endcode
375 /// And produce a "numeric string representation" like so:
376 /// 1, 1, 2, unique_integer_1, 3, 1, unique_integer_2
377 ///
378 /// TODO: This is very similar to the MachineOutliner, and should be
379 /// consolidated into the same interface.
380 struct IRInstructionMapper {
381   /// The starting illegal instruction number to map to.
382   ///
383   /// Set to -3 for compatibility with DenseMapInfo<unsigned>.
384   unsigned IllegalInstrNumber = static_cast<unsigned>(-3);
385 
386   /// The next available integer to assign to a legal Instruction to.
387   unsigned LegalInstrNumber = 0;
388 
389   /// Correspondence from IRInstructionData to unsigned integers.
390   DenseMap<IRInstructionData *, unsigned, IRInstructionDataTraits>
391       InstructionIntegerMap;
392 
393   /// A mapping for a basic block in a module to its assigned number/location
394   /// in the module.
395   DenseMap<BasicBlock *, unsigned> BasicBlockToInteger;
396 
397   /// Set if we added an illegal number in the previous step.
398   /// Since each illegal number is unique, we only need one of them between
399   /// each range of legal numbers. This lets us make sure we don't add more
400   /// than one illegal number per range.
401   bool AddedIllegalLastTime = false;
402 
403   /// Marks whether we found a illegal instruction in the previous step.
404   bool CanCombineWithPrevInstr = false;
405 
406   /// Marks whether we have found a set of instructions that is long enough
407   /// to be considered for similarity.
408   bool HaveLegalRange = false;
409 
410   /// Marks whether we should use exact function names, as well as types to
411   /// find similarity between calls.
412   bool EnableMatchCallsByName = false;
413 
414   /// This allocator pointer is in charge of holding on to the IRInstructionData
415   /// so it is not deallocated until whatever external tool is using it is done
416   /// with the information.
417   SpecificBumpPtrAllocator<IRInstructionData> *InstDataAllocator = nullptr;
418 
419   /// This allocator pointer is in charge of creating the IRInstructionDataList
420   /// so it is not deallocated until whatever external tool is using it is done
421   /// with the information.
422   SpecificBumpPtrAllocator<IRInstructionDataList> *IDLAllocator = nullptr;
423 
424   /// Get an allocated IRInstructionData struct using the InstDataAllocator.
425   ///
426   /// \param I - The Instruction to wrap with IRInstructionData.
427   /// \param Legality - A boolean value that is true if the instruction is to
428   /// be considered for similarity, and false if not.
429   /// \param IDL - The InstructionDataList that the IRInstructionData is
430   /// inserted into.
431   /// \returns An allocated IRInstructionData struct.
432   IRInstructionData *allocateIRInstructionData(Instruction &I, bool Legality,
433                                                IRInstructionDataList &IDL);
434 
435   /// Get an empty allocated IRInstructionData struct using the
436   /// InstDataAllocator.
437   ///
438   /// \param IDL - The InstructionDataList that the IRInstructionData is
439   /// inserted into.
440   /// \returns An allocated IRInstructionData struct.
441   IRInstructionData *allocateIRInstructionData(IRInstructionDataList &IDL);
442 
443   /// Get an allocated IRInstructionDataList object using the IDLAllocator.
444   ///
445   /// \returns An allocated IRInstructionDataList object.
446   IRInstructionDataList *allocateIRInstructionDataList();
447 
448   IRInstructionDataList *IDL = nullptr;
449 
450   /// Assigns values to all the basic blocks in function \p F starting from
451   /// integer \p BBNumber.
452   ///
453   /// \param F - The function containing the basic blocks to assign numbers to.
454   /// \param BBNumber - The number to start from.
455   void initializeForBBs(Function &F, unsigned &BBNumber) {
456     for (BasicBlock &BB : F)
457       BasicBlockToInteger.insert(std::make_pair(&BB, BBNumber++));
458   }
459 
460   /// Assigns values to all the basic blocks in Module \p M.
461   /// \param M - The module containing the basic blocks to assign numbers to.
462   void initializeForBBs(Module &M) {
463     unsigned BBNumber = 0;
464     for (Function &F : M)
465       initializeForBBs(F, BBNumber);
466   }
467 
468   /// Maps the Instructions in a BasicBlock \p BB to legal or illegal integers
469   /// determined by \p InstrType. Two Instructions are mapped to the same value
470   /// if they are close as defined by the InstructionData class above.
471   ///
472   /// \param [in] BB - The BasicBlock to be mapped to integers.
473   /// \param [in,out] InstrList - Vector of IRInstructionData to append to.
474   /// \param [in,out] IntegerMapping - Vector of unsigned integers to append to.
475   void convertToUnsignedVec(BasicBlock &BB,
476                             std::vector<IRInstructionData *> &InstrList,
477                             std::vector<unsigned> &IntegerMapping);
478 
479   /// Maps an Instruction to a legal integer.
480   ///
481   /// \param [in] It - The Instruction to be mapped to an integer.
482   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
483   /// append to.
484   /// \param [in,out] InstrListForBB - Vector of InstructionData to append to.
485   /// \returns The integer \p It was mapped to.
486   unsigned mapToLegalUnsigned(BasicBlock::iterator &It,
487                               std::vector<unsigned> &IntegerMappingForBB,
488                               std::vector<IRInstructionData *> &InstrListForBB);
489 
490   /// Maps an Instruction to an illegal integer.
491   ///
492   /// \param [in] It - The \p Instruction to be mapped to an integer.
493   /// \param [in,out] IntegerMappingForBB - Vector of unsigned integers to
494   /// append to.
495   /// \param [in,out] InstrListForBB - Vector of IRInstructionData to append to.
496   /// \param End - true if creating a dummy IRInstructionData at the end of a
497   /// basic block.
498   /// \returns The integer \p It was mapped to.
499   unsigned mapToIllegalUnsigned(
500       BasicBlock::iterator &It, std::vector<unsigned> &IntegerMappingForBB,
501       std::vector<IRInstructionData *> &InstrListForBB, bool End = false);
502 
503   IRInstructionMapper(SpecificBumpPtrAllocator<IRInstructionData> *IDA,
504                       SpecificBumpPtrAllocator<IRInstructionDataList> *IDLA)
505       : InstDataAllocator(IDA), IDLAllocator(IDLA) {
506     // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
507     // changed.
508     assert(DenseMapInfo<unsigned>::getEmptyKey() == static_cast<unsigned>(-1) &&
509            "DenseMapInfo<unsigned>'s empty key isn't -1!");
510     assert(DenseMapInfo<unsigned>::getTombstoneKey() ==
511                static_cast<unsigned>(-2) &&
512            "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
513 
514     IDL = new (IDLAllocator->Allocate())
515         IRInstructionDataList();
516   }
517 
518   /// Custom InstVisitor to classify different instructions for whether it can
519   /// be analyzed for similarity.
520   struct InstructionClassification
521       : public InstVisitor<InstructionClassification, InstrType> {
522     InstructionClassification() = default;
523 
524     // TODO: Determine a scheme to resolve when the label is similar enough.
525     InstrType visitBranchInst(BranchInst &BI) {
526       if (EnableBranches)
527         return Legal;
528       return Illegal;
529     }
530     InstrType visitPHINode(PHINode &PN) {
531       if (EnableBranches)
532         return Legal;
533       return Illegal;
534     }
535     // TODO: Handle allocas.
536     InstrType visitAllocaInst(AllocaInst &AI) { return Illegal; }
537     // We exclude variable argument instructions since variable arguments
538     // requires extra checking of the argument list.
539     InstrType visitVAArgInst(VAArgInst &VI) { return Illegal; }
540     // We exclude all exception handling cases since they are so context
541     // dependent.
542     InstrType visitLandingPadInst(LandingPadInst &LPI) { return Illegal; }
543     InstrType visitFuncletPadInst(FuncletPadInst &FPI) { return Illegal; }
544     // DebugInfo should be included in the regions, but should not be
545     // analyzed for similarity as it has no bearing on the outcome of the
546     // program.
547     InstrType visitDbgInfoIntrinsic(DbgInfoIntrinsic &DII) { return Invisible; }
548     InstrType visitIntrinsicInst(IntrinsicInst &II) {
549       // These are disabled due to complications in the CodeExtractor when
550       // outlining these instructions.  For instance, It is unclear what we
551       // should do when moving only the start or end lifetime instruction into
552       // an outlined function. Also, assume-like intrinsics could be removed
553       // from the region, removing arguments, causing discrepencies in the
554       // number of inputs between different regions.
555       if (II.isAssumeLikeIntrinsic())
556         return Illegal;
557       return EnableIntrinsics ? Legal : Illegal;
558     }
559     // We only allow call instructions where the function has a name and
560     // is not an indirect call.
561     InstrType visitCallInst(CallInst &CI) {
562       Function *F = CI.getCalledFunction();
563       bool IsIndirectCall = CI.isIndirectCall();
564       if (IsIndirectCall && !EnableIndirectCalls)
565         return Illegal;
566       if (!F && !IsIndirectCall)
567         return Illegal;
568       // Functions marked with the swifttailcc and tailcc calling conventions
569       // require special handling when outlining musttail functions.  The
570       // calling convention must be passed down to the outlined function as
571       // well. Further, there is special handling for musttail calls as well,
572       // requiring a return call directly after.  For now, the outliner does not
573       // support this, so we do not handle matching this case either.
574       if ((CI.getCallingConv() == CallingConv::SwiftTail ||
575            CI.getCallingConv() == CallingConv::Tail) &&
576           !EnableMustTailCalls)
577         return Illegal;
578       if (CI.isMustTailCall() && !EnableMustTailCalls)
579         return Illegal;
580       return Legal;
581     }
582     // TODO: We do not current handle similarity that changes the control flow.
583     InstrType visitInvokeInst(InvokeInst &II) { return Illegal; }
584     // TODO: We do not current handle similarity that changes the control flow.
585     InstrType visitCallBrInst(CallBrInst &CBI) { return Illegal; }
586     // TODO: Handle interblock similarity.
587     InstrType visitTerminator(Instruction &I) { return Illegal; }
588     InstrType visitInstruction(Instruction &I) { return Legal; }
589 
590     // The flag variable that lets the classifier know whether we should
591     // allow branches to be checked for similarity.
592     bool EnableBranches = false;
593 
594     // The flag variable that lets the classifier know whether we should
595     // allow indirect calls to be considered legal instructions.
596     bool EnableIndirectCalls = false;
597 
598     // Flag that lets the classifier know whether we should allow intrinsics to
599     // be checked for similarity.
600     bool EnableIntrinsics = false;
601 
602     // Flag that lets the classifier know whether we should allow tail calls to
603     // be checked for similarity.
604     bool EnableMustTailCalls = false;
605   };
606 
607   /// Maps an Instruction to a member of InstrType.
608   InstructionClassification InstClassifier;
609 };
610 
611 /// This is a class that wraps a range of IRInstructionData from one point to
612 /// another in the vector of IRInstructionData, which is a region of the
613 /// program.  It is also responsible for defining the structure within this
614 /// region of instructions.
615 ///
616 /// The structure of a region is defined through a value numbering system
617 /// assigned to each unique value in a region at the creation of the
618 /// IRSimilarityCandidate.
619 ///
620 /// For example, for each Instruction we add a mapping for each new
621 /// value seen in that Instruction.
622 /// IR:                    Mapping Added:
623 /// %add1 = add i32 %a, c1    %add1 -> 3, %a -> 1, c1 -> 2
624 /// %add2 = add i32 %a, %1    %add2 -> 4
625 /// %add3 = add i32 c2, c1    %add3 -> 6, c2 -> 5
626 ///
627 /// We can compare IRSimilarityCandidates against one another.
628 /// The \ref isSimilar function compares each IRInstructionData against one
629 /// another and if we have the same sequences of IRInstructionData that would
630 /// create the same hash, we have similar IRSimilarityCandidates.
631 ///
632 /// We can also compare the structure of IRSimilarityCandidates. If we can
633 /// create a mapping of registers in the region contained by one
634 /// IRSimilarityCandidate to the region contained by different
635 /// IRSimilarityCandidate, they can be considered structurally similar.
636 ///
637 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
638 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, %e
639 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
640 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
641 ///
642 /// Can have the following mapping from candidate to candidate of:
643 /// %a -> %d, %b -> %e, %c -> %f, c1 -> c3, c2 -> c4
644 /// and can be considered similar.
645 ///
646 /// IRSimilarityCandidate1:   IRSimilarityCandidate2:
647 /// %add1 = add i32 %a, %b    %add1 = add i32 %d, c4
648 /// %add2 = add i32 %a, %c    %add2 = add i32 %d, %f
649 /// %add3 = add i32 c1, c2    %add3 = add i32 c3, c4
650 ///
651 /// We cannot create the same mapping since the use of c4 is not used in the
652 /// same way as %b or c2.
653 class IRSimilarityCandidate {
654 private:
655   /// The start index of this IRSimilarityCandidate in the instruction list.
656   unsigned StartIdx = 0;
657 
658   /// The number of instructions in this IRSimilarityCandidate.
659   unsigned Len = 0;
660 
661   /// The first instruction in this IRSimilarityCandidate.
662   IRInstructionData *FirstInst = nullptr;
663 
664   /// The last instruction in this IRSimilarityCandidate.
665   IRInstructionData *LastInst = nullptr;
666 
667   /// Global Value Numbering structures
668   /// @{
669   /// Stores the mapping of the value to the number assigned to it in the
670   /// IRSimilarityCandidate.
671   DenseMap<Value *, unsigned> ValueToNumber;
672   /// Stores the mapping of the number to the value assigned this number.
673   DenseMap<unsigned, Value *> NumberToValue;
674   /// Stores the mapping of a value's number to canonical numbering in the
675   /// candidate's respective similarity group.
676   DenseMap<unsigned, unsigned> NumberToCanonNum;
677   /// Stores the mapping of canonical number in the candidate's respective
678   /// similarity group to a value number.
679   DenseMap<unsigned, unsigned> CanonNumToNumber;
680   /// @}
681 
682 public:
683   /// \param StartIdx - The starting location of the region.
684   /// \param Len - The length of the region.
685   /// \param FirstInstIt - The starting IRInstructionData of the region.
686   /// \param LastInstIt - The ending IRInstructionData of the region.
687   IRSimilarityCandidate(unsigned StartIdx, unsigned Len,
688                         IRInstructionData *FirstInstIt,
689                         IRInstructionData *LastInstIt);
690 
691   /// \param A - The first IRInstructionCandidate to compare.
692   /// \param B - The second IRInstructionCandidate to compare.
693   /// \returns True when every IRInstructionData in \p A is similar to every
694   /// IRInstructionData in \p B.
695   static bool isSimilar(const IRSimilarityCandidate &A,
696                         const IRSimilarityCandidate &B);
697 
698   /// \param [in] A - The first IRInstructionCandidate to compare.
699   /// \param [in] B - The second IRInstructionCandidate to compare.
700   /// \returns True when every IRInstructionData in \p A is structurally similar
701   /// to \p B.
702   static bool compareStructure(const IRSimilarityCandidate &A,
703                                const IRSimilarityCandidate &B);
704 
705   /// \param [in] A - The first IRInstructionCandidate to compare.
706   /// \param [in] B - The second IRInstructionCandidate to compare.
707   /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from
708   /// candidate \p A to candidate \B.
709   /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from
710   /// candidate \p B to candidate \A.
711   /// \returns True when every IRInstructionData in \p A is structurally similar
712   /// to \p B.
713   static bool
714   compareStructure(const IRSimilarityCandidate &A,
715                    const IRSimilarityCandidate &B,
716                    DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
717                    DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB);
718 
719   struct OperandMapping {
720     /// The IRSimilarityCandidate that holds the instruction the OperVals were
721     /// pulled from.
722     const IRSimilarityCandidate &IRSC;
723 
724     /// The operand values to be analyzed.
725     ArrayRef<Value *> &OperVals;
726 
727     /// The current mapping of global value numbers from one IRSimilarityCandidate
728     /// to another IRSimilarityCandidate.
729     DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMapping;
730   };
731 
732   /// A helper struct to hold the candidate, for a branch instruction, the
733   /// relative location of a label, and the label itself.  This is mostly to
734   /// group the values together before passing them as a bundle to a function.
735   struct RelativeLocMapping {
736     /// The IRSimilarityCandidate that holds the instruction the relative
737     /// location was pulled from.
738     const IRSimilarityCandidate &IRSC;
739 
740     /// The relative location to be analyzed.
741     int RelativeLocation;
742 
743     /// The corresponding value.
744     Value *OperVal;
745   };
746 
747   /// Compare the operands in \p A and \p B and check that the current mapping
748   /// of global value numbers from \p A to \p B and \p B to \A is consistent.
749   ///
750   /// \param A - The first IRInstructionCandidate, operand values, and current
751   /// operand mappings to compare.
752   /// \param B - The second IRInstructionCandidate, operand values, and current
753   /// operand mappings to compare.
754   /// \returns true if the IRSimilarityCandidates operands are compatible.
755   static bool compareNonCommutativeOperandMapping(OperandMapping A,
756                                                   OperandMapping B);
757 
758   /// Compare the operands in \p A and \p B and check that the current mapping
759   /// of global value numbers from \p A to \p B and \p B to \A is consistent
760   /// given that the operands are commutative.
761   ///
762   /// \param A - The first IRInstructionCandidate, operand values, and current
763   /// operand mappings to compare.
764   /// \param B - The second IRInstructionCandidate, operand values, and current
765   /// operand mappings to compare.
766   /// \returns true if the IRSimilarityCandidates operands are compatible.
767   static bool compareCommutativeOperandMapping(OperandMapping A,
768                                                OperandMapping B);
769 
770   /// Compare the GVN of the assignment value in corresponding instructions in
771   /// IRSimilarityCandidates \p A and \p B and check that there exists a mapping
772   /// between the values and replaces the mapping with a one-to-one value if
773   /// needed.
774   ///
775   /// \param InstValA - The assignment GVN from the first IRSimilarityCandidate.
776   /// \param InstValB - The assignment GVN from the second
777   /// IRSimilarityCandidate.
778   /// \param [in,out] ValueNumberMappingA - A mapping of value numbers from
779   /// candidate \p A to candidate \B.
780   /// \param [in,out] ValueNumberMappingB - A mapping of value numbers from
781   /// candidate \p B to candidate \A.
782   /// \returns true if the IRSimilarityCandidates assignments are compatible.
783   static bool compareAssignmentMapping(
784       const unsigned InstValA, const unsigned &InstValB,
785       DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingA,
786       DenseMap<unsigned, DenseSet<unsigned>> &ValueNumberMappingB);
787 
788   /// Compare the relative locations in \p A and \p B and check that the
789   /// distances match if both locations are contained in the region, and that
790   /// the branches both point outside the region if they do not.
791   /// Example Region:
792   /// \code
793   /// entry:
794   ///   br i1 %0, label %block_1, label %block_3
795   /// block_0:
796   ///   br i1 %0, label %block_1, label %block_2
797   /// block_1:
798   ///   br i1 %0, label %block_2, label %block_3
799   /// block_2:
800   ///   br i1 %1, label %block_1, label %block_4
801   /// block_3:
802   ///   br i1 %2, label %block_2, label %block_5
803   /// \endcode
804   /// If we compare the branches in block_0 and block_1 the relative values are
805   /// 1 and 2 for both, so we consider this a match.
806   ///
807   /// If we compare the branches in entry and block_0 the relative values are
808   /// 2 and 3, and 1 and 2 respectively.  Since these are not the same we do not
809   /// consider them a match.
810   ///
811   /// If we compare the branches in block_1 and block_2 the relative values are
812   /// 1 and 2, and -1 and None respectively.  As a result we do not consider
813   /// these to be the same
814   ///
815   /// If we compare the branches in block_2 and block_3 the relative values are
816   /// -1 and None for both.  We do consider these to be a match.
817   ///
818   /// \param A - The first IRInstructionCandidate, relative location value,
819   /// and incoming block.
820   /// \param B - The second IRInstructionCandidate, relative location value,
821   /// and incoming block.
822   /// \returns true if the relative locations match.
823   static bool checkRelativeLocations(RelativeLocMapping A,
824                                      RelativeLocMapping B);
825 
826   /// Create a mapping from the value numbering to a different separate set of
827   /// numbers. This will serve as a guide for relating one candidate to another.
828   /// The canonical number gives use the ability identify which global value
829   /// number in one candidate relates to the global value number in the other.
830   ///
831   /// \param [in, out] CurrCand - The IRSimilarityCandidate to create a
832   /// canonical numbering for.
833   static void createCanonicalMappingFor(IRSimilarityCandidate &CurrCand);
834 
835   /// Create a mapping for the value numbering of the calling
836   /// IRSimilarityCandidate, to a different separate set of numbers, based on
837   /// the canonical ordering in \p SourceCand. These are defined based on the
838   /// found mappings in \p ToSourceMapping and \p FromSourceMapping.  Both of
839   /// these relationships should have the same information, just in opposite
840   /// directions.
841   ///
842   /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
843   /// canonical numbering from.
844   /// \param ToSourceMapping - The mapping of value numbers from this candidate
845   /// to \p SourceCand.
846   /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand
847   /// to this candidate.
848   void createCanonicalRelationFrom(
849       IRSimilarityCandidate &SourceCand,
850       DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
851       DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping);
852 
853   /// Create a mapping for the value numbering of the calling
854   /// IRSimilarityCandidate, to a different separate set of numbers, based on
855   /// the canonical ordering in \p SourceCand. These are defined based on the
856   /// found mappings in \p ToSourceMapping and \p FromSourceMapping.  Both of
857   /// these relationships should have the same information, just in opposite
858   /// directions.  Uses the \p OneToOne mapping from target candidate to \p
859   /// SourceCand GVNs to determine the mapping first for values with multiple
860   /// mappings.  This mapping is created by the ordering of operands in the
861   /// instruction they are first seen in the candidates.
862   ///
863   /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
864   /// canonical numbering from.
865   /// \param [in,out] OneToOne - A mapping of value numbers from candidate
866   /// \p A to candidate \B using the structure of the original instructions.
867   /// \param ToSourceMapping - The mapping of value numbers from this candidate
868   /// to \p SourceCand.
869   /// \param FromSourceMapping - The mapping of value numbers from \p SoureCand
870   /// to this candidate.
871   void createCanonicalRelationFrom(
872       IRSimilarityCandidate &SourceCand,
873       DenseMap<unsigned, unsigned> &OneToOne,
874       DenseMap<unsigned, DenseSet<unsigned>> &ToSourceMapping,
875       DenseMap<unsigned, DenseSet<unsigned>> &FromSourceMapping);
876 
877   /// Create a mapping for the value numbering of the calling
878   /// IRSimilarityCandidate, to a different separate set of numbers, based on
879   /// the canonical ordering in \p SourceCand. These are defined based on the
880   /// canonical mapping defined between \p SoureCandLarge and
881   /// \p TargetCandLarge.  These IRSimilarityCandidates are already structurally
882   /// similar, and fully encapsulate the IRSimilarityCandidates in question.
883   /// These are used as a "bridge" from the \p SourceCand to the target.
884   ///
885   /// \param [in, out] SourceCand - The IRSimilarityCandidate to create a
886   /// canonical numbering from.
887   /// \param SoureCandLarge - The IRSimilarityCandidate fully containing
888   /// \p SourceCand.
889   /// \param TargetCandLarge -  The IRSimilarityCandidate fully containing
890   /// this Candidate.
891   void createCanonicalRelationFrom(
892       IRSimilarityCandidate &SourceCand,
893       IRSimilarityCandidate &SourceCandLarge,
894       IRSimilarityCandidate &TargetCandLarge);
895 
896   /// \param [in,out] BBSet - The set to track the basic blocks.
897   void getBasicBlocks(DenseSet<BasicBlock *> &BBSet) const {
898     for (IRInstructionData &ID : *this) {
899       BasicBlock *BB = ID.Inst->getParent();
900       BBSet.insert(BB);
901     }
902   }
903 
904   /// \param [in,out] BBSet - The set to track the basic blocks.
905   /// \param [in,out] BBList - A list in order of use to track the basic blocks.
906   void getBasicBlocks(DenseSet<BasicBlock *> &BBSet,
907                       SmallVector<BasicBlock *> &BBList) const {
908     for (IRInstructionData &ID : *this) {
909       BasicBlock *BB = ID.Inst->getParent();
910       if (BBSet.insert(BB).second)
911         BBList.push_back(BB);
912     }
913   }
914 
915   /// Compare the start and end indices of the two IRSimilarityCandidates for
916   /// whether they overlap. If the start instruction of one
917   /// IRSimilarityCandidate is less than the end instruction of the other, and
918   /// the start instruction of one is greater than the start instruction of the
919   /// other, they overlap.
920   ///
921   /// \returns true if the IRSimilarityCandidates do not have overlapping
922   /// instructions.
923   static bool overlap(const IRSimilarityCandidate &A,
924                       const IRSimilarityCandidate &B);
925 
926   /// \returns the number of instructions in this Candidate.
927   unsigned getLength() const { return Len; }
928 
929   /// \returns the start index of this IRSimilarityCandidate.
930   unsigned getStartIdx() const { return StartIdx; }
931 
932   /// \returns the end index of this IRSimilarityCandidate.
933   unsigned getEndIdx() const { return StartIdx + Len - 1; }
934 
935   /// \returns The first IRInstructionData.
936   IRInstructionData *front() const { return FirstInst; }
937   /// \returns The last IRInstructionData.
938   IRInstructionData *back() const { return LastInst; }
939 
940   /// \returns The first Instruction.
941   Instruction *frontInstruction() { return FirstInst->Inst; }
942   /// \returns The last Instruction
943   Instruction *backInstruction() { return LastInst->Inst; }
944 
945   /// \returns The BasicBlock the IRSimilarityCandidate starts in.
946   BasicBlock *getStartBB() { return FirstInst->Inst->getParent(); }
947   /// \returns The BasicBlock the IRSimilarityCandidate ends in.
948   BasicBlock *getEndBB() { return LastInst->Inst->getParent(); }
949 
950   /// \returns The Function that the IRSimilarityCandidate is located in.
951   Function *getFunction() { return getStartBB()->getParent(); }
952 
953   /// Finds the positive number associated with \p V if it has been mapped.
954   /// \param [in] V - the Value to find.
955   /// \returns The positive number corresponding to the value.
956   /// \returns std::nullopt if not present.
957   std::optional<unsigned> getGVN(Value *V) {
958     assert(V != nullptr && "Value is a nullptr?");
959     DenseMap<Value *, unsigned>::iterator VNIt = ValueToNumber.find(V);
960     if (VNIt == ValueToNumber.end())
961       return std::nullopt;
962     return VNIt->second;
963   }
964 
965   /// Finds the Value associate with \p Num if it exists.
966   /// \param [in] Num - the number to find.
967   /// \returns The Value associated with the number.
968   /// \returns std::nullopt if not present.
969   std::optional<Value *> fromGVN(unsigned Num) {
970     DenseMap<unsigned, Value *>::iterator VNIt = NumberToValue.find(Num);
971     if (VNIt == NumberToValue.end())
972       return std::nullopt;
973     assert(VNIt->second != nullptr && "Found value is a nullptr!");
974     return VNIt->second;
975   }
976 
977   /// Find the canonical number from the global value number \p N stored in the
978   /// candidate.
979   ///
980   /// \param N - The global value number to find the canonical number for.
981   /// \returns An optional containing the value, and std::nullopt if it could
982   /// not be found.
983   std::optional<unsigned> getCanonicalNum(unsigned N) {
984     DenseMap<unsigned, unsigned>::iterator NCIt = NumberToCanonNum.find(N);
985     if (NCIt == NumberToCanonNum.end())
986       return std::nullopt;
987     return NCIt->second;
988   }
989 
990   /// Find the global value number from the canonical number \p N stored in the
991   /// candidate.
992   ///
993   /// \param N - The canonical number to find the global vlaue number for.
994   /// \returns An optional containing the value, and std::nullopt if it could
995   /// not be found.
996   std::optional<unsigned> fromCanonicalNum(unsigned N) {
997     DenseMap<unsigned, unsigned>::iterator CNIt = CanonNumToNumber.find(N);
998     if (CNIt == CanonNumToNumber.end())
999       return std::nullopt;
1000     return CNIt->second;
1001   }
1002 
1003   /// \param RHS -The IRSimilarityCandidate to compare against
1004   /// \returns true if the IRSimilarityCandidate is occurs after the
1005   /// IRSimilarityCandidate in the program.
1006   bool operator<(const IRSimilarityCandidate &RHS) const {
1007     return getStartIdx() > RHS.getStartIdx();
1008   }
1009 
1010   using iterator = IRInstructionDataList::iterator;
1011   iterator begin() const { return iterator(front()); }
1012   iterator end() const { return std::next(iterator(back())); }
1013 };
1014 
1015 typedef DenseMap<IRSimilarityCandidate *,
1016                  DenseMap<unsigned, DenseSet<unsigned>>>
1017     CandidateGVNMapping;
1018 typedef std::vector<IRSimilarityCandidate> SimilarityGroup;
1019 typedef std::vector<SimilarityGroup> SimilarityGroupList;
1020 
1021 /// This class puts all the pieces of the IRInstructionData,
1022 /// IRInstructionMapper, IRSimilarityCandidate together.
1023 ///
1024 /// It first feeds the Module or vector of Modules into the IRInstructionMapper,
1025 /// and puts all the mapped instructions into a single long list of
1026 /// IRInstructionData.
1027 ///
1028 /// The list of unsigned integers is given to the Suffix Tree or similar data
1029 /// structure to find repeated subsequences.  We construct an
1030 /// IRSimilarityCandidate for each instance of the subsequence.  We compare them
1031 /// against one another since  These repeated subsequences can have different
1032 /// structure.  For each different kind of structure found, we create a
1033 /// similarity group.
1034 ///
1035 /// If we had four IRSimilarityCandidates A, B, C, and D where A, B and D are
1036 /// structurally similar to one another, while C is different we would have two
1037 /// SimilarityGroups:
1038 ///
1039 /// SimilarityGroup 1:  SimilarityGroup 2
1040 /// A, B, D             C
1041 ///
1042 /// A list of the different similarity groups is then returned after
1043 /// analyzing the module.
1044 class IRSimilarityIdentifier {
1045 public:
1046   IRSimilarityIdentifier(bool MatchBranches = true,
1047                          bool MatchIndirectCalls = true,
1048                          bool MatchCallsWithName = false,
1049                          bool MatchIntrinsics = true,
1050                          bool MatchMustTailCalls = true)
1051       : Mapper(&InstDataAllocator, &InstDataListAllocator),
1052         EnableBranches(MatchBranches), EnableIndirectCalls(MatchIndirectCalls),
1053         EnableMatchingCallsByName(MatchCallsWithName),
1054         EnableIntrinsics(MatchIntrinsics),
1055         EnableMustTailCalls(MatchMustTailCalls) {}
1056 
1057 private:
1058   /// Map the instructions in the module to unsigned integers, using mapping
1059   /// already present in the Mapper if possible.
1060   ///
1061   /// \param [in] M Module - To map to integers.
1062   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
1063   /// \param [in,out] IntegerMapping - The vector to append integers to.
1064   void populateMapper(Module &M, std::vector<IRInstructionData *> &InstrList,
1065                       std::vector<unsigned> &IntegerMapping);
1066 
1067   /// Map the instructions in the modules vector to unsigned integers, using
1068   /// mapping already present in the mapper if possible.
1069   ///
1070   /// \param [in] Modules - The list of modules to use to populate the mapper
1071   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
1072   /// \param [in,out] IntegerMapping - The vector to append integers to.
1073   void populateMapper(ArrayRef<std::unique_ptr<Module>> &Modules,
1074                       std::vector<IRInstructionData *> &InstrList,
1075                       std::vector<unsigned> &IntegerMapping);
1076 
1077   /// Find the similarity candidates in \p InstrList and corresponding
1078   /// \p UnsignedVec
1079   ///
1080   /// \param [in,out] InstrList - The vector to append IRInstructionData to.
1081   /// \param [in,out] IntegerMapping - The vector to append integers to.
1082   /// candidates found in the program.
1083   void findCandidates(std::vector<IRInstructionData *> &InstrList,
1084                       std::vector<unsigned> &IntegerMapping);
1085 
1086 public:
1087   // Find the IRSimilarityCandidates in the \p Modules and group by structural
1088   // similarity in a SimilarityGroup, each group is returned in a
1089   // SimilarityGroupList.
1090   //
1091   // \param [in] Modules - the modules to analyze.
1092   // \returns The groups of similarity ranges found in the modules.
1093   SimilarityGroupList &
1094   findSimilarity(ArrayRef<std::unique_ptr<Module>> Modules);
1095 
1096   // Find the IRSimilarityCandidates in the given Module grouped by structural
1097   // similarity in a SimilarityGroup, contained inside a SimilarityGroupList.
1098   //
1099   // \param [in] M - the module to analyze.
1100   // \returns The groups of similarity ranges found in the module.
1101   SimilarityGroupList &findSimilarity(Module &M);
1102 
1103   // Clears \ref SimilarityCandidates if it is already filled by a previous run.
1104   void resetSimilarityCandidates() {
1105     // If we've already analyzed a Module or set of Modules, so we must clear
1106     // the SimilarityCandidates to make sure we do not have only old values
1107     // hanging around.
1108     if (SimilarityCandidates)
1109       SimilarityCandidates->clear();
1110     else
1111       SimilarityCandidates = SimilarityGroupList();
1112   }
1113 
1114   // \returns The groups of similarity ranges found in the most recently passed
1115   // set of modules.
1116   std::optional<SimilarityGroupList> &getSimilarity() {
1117     return SimilarityCandidates;
1118   }
1119 
1120 private:
1121   /// The allocator for IRInstructionData.
1122   SpecificBumpPtrAllocator<IRInstructionData> InstDataAllocator;
1123 
1124   /// The allocator for IRInstructionDataLists.
1125   SpecificBumpPtrAllocator<IRInstructionDataList> InstDataListAllocator;
1126 
1127   /// Map Instructions to unsigned integers and wraps the Instruction in an
1128   /// instance of IRInstructionData.
1129   IRInstructionMapper Mapper;
1130 
1131   /// The flag variable that marks whether we should check branches for
1132   /// similarity, or only look within basic blocks.
1133   bool EnableBranches = true;
1134 
1135   /// The flag variable that marks whether we allow indirect calls to be checked
1136   /// for similarity, or exclude them as a legal instruction.
1137   bool EnableIndirectCalls = true;
1138 
1139   /// The flag variable that marks whether we allow calls to be marked as
1140   /// similar if they do not have the same name, only the same calling
1141   /// convention, attributes and type signature.
1142   bool EnableMatchingCallsByName = true;
1143 
1144   /// The flag variable that marks whether we should check intrinsics for
1145   /// similarity.
1146   bool EnableIntrinsics = true;
1147 
1148   // The flag variable that marks whether we should allow tailcalls
1149   // to be checked for similarity.
1150   bool EnableMustTailCalls = false;
1151 
1152   /// The SimilarityGroups found with the most recent run of \ref
1153   /// findSimilarity. std::nullopt if there is no recent run.
1154   std::optional<SimilarityGroupList> SimilarityCandidates;
1155 };
1156 
1157 } // end namespace IRSimilarity
1158 
1159 /// An analysis pass based on legacy pass manager that runs and returns
1160 /// IRSimilarityIdentifier run on the Module.
1161 class IRSimilarityIdentifierWrapperPass : public ModulePass {
1162   std::unique_ptr<IRSimilarity::IRSimilarityIdentifier> IRSI;
1163 
1164 public:
1165   static char ID;
1166   IRSimilarityIdentifierWrapperPass();
1167 
1168   IRSimilarity::IRSimilarityIdentifier &getIRSI() { return *IRSI; }
1169   const IRSimilarity::IRSimilarityIdentifier &getIRSI() const { return *IRSI; }
1170 
1171   bool doInitialization(Module &M) override;
1172   bool doFinalization(Module &M) override;
1173   bool runOnModule(Module &M) override;
1174   void getAnalysisUsage(AnalysisUsage &AU) const override {
1175     AU.setPreservesAll();
1176   }
1177 };
1178 
1179 /// An analysis pass that runs and returns the IRSimilarityIdentifier run on the
1180 /// Module.
1181 class IRSimilarityAnalysis : public AnalysisInfoMixin<IRSimilarityAnalysis> {
1182 public:
1183   typedef IRSimilarity::IRSimilarityIdentifier Result;
1184 
1185   Result run(Module &M, ModuleAnalysisManager &);
1186 
1187 private:
1188   friend AnalysisInfoMixin<IRSimilarityAnalysis>;
1189   static AnalysisKey Key;
1190 };
1191 
1192 /// Printer pass that uses \c IRSimilarityAnalysis.
1193 class IRSimilarityAnalysisPrinterPass
1194     : public PassInfoMixin<IRSimilarityAnalysisPrinterPass> {
1195   raw_ostream &OS;
1196 
1197 public:
1198   explicit IRSimilarityAnalysisPrinterPass(raw_ostream &OS) : OS(OS) {}
1199   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
1200   static bool isRequired() { return true; }
1201 };
1202 
1203 } // end namespace llvm
1204 
1205 #endif // LLVM_ANALYSIS_IRSIMILARITYIDENTIFIER_H
1206