//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass performs a simple dominator tree walk that eliminates trivially
// redundant instructions.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Scalar/EarlyCSE.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/ScopedHashTable.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/MemorySSA.h"
#include "llvm/Transforms/Utils/MemorySSAUpdater.h"
#include <deque>
using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "early-cse"

STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
STATISTIC(NumCSE,      "Number of instructions CSE'd");
STATISTIC(NumCSECVP,   "Number of compare instructions CVP'd");
STATISTIC(NumCSELoad,  "Number of load instructions CSE'd");
STATISTIC(NumCSECall,  "Number of call instructions CSE'd");
STATISTIC(NumDSE,      "Number of trivial dead stores removed");

//===----------------------------------------------------------------------===//
// SimpleValue
//===----------------------------------------------------------------------===//

namespace {
/// \brief Struct representing the available values in the scoped hash table.
struct SimpleValue {
  Instruction *Inst;

  SimpleValue(Instruction *I) : Inst(I) {
    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
  }

  bool isSentinel() const {
    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static bool canHandle(Instruction *Inst) {
    // This can only handle non-void readnone functions.
    if (CallInst *CI = dyn_cast<CallInst>(Inst))
      return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
    return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) ||
           isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) ||
           isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) ||
           isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) ||
           isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst);
  }
};
}

namespace llvm {
template <> struct DenseMapInfo<SimpleValue> {
  static inline SimpleValue getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }
  static inline SimpleValue getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }
  static unsigned getHashValue(SimpleValue Val);
  static bool isEqual(SimpleValue LHS, SimpleValue RHS);
};
}

unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
  Instruction *Inst = Val.Inst;
  // Hash in all of the operands as pointers.
  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
    Value *LHS = BinOp->getOperand(0);
    Value *RHS = BinOp->getOperand(1);
    if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
      std::swap(LHS, RHS);

    return hash_combine(BinOp->getOpcode(), LHS, RHS);
  }

  if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
    Value *LHS = CI->getOperand(0);
    Value *RHS = CI->getOperand(1);
    CmpInst::Predicate Pred = CI->getPredicate();
    if (Inst->getOperand(0) > Inst->getOperand(1)) {
      std::swap(LHS, RHS);
      Pred = CI->getSwappedPredicate();
    }
    return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
  }

  if (CastInst *CI = dyn_cast<CastInst>(Inst))
    return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));

  if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
    return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
                        hash_combine_range(EVI->idx_begin(), EVI->idx_end()));

  if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
    return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
                        IVI->getOperand(1),
                        hash_combine_range(IVI->idx_begin(), IVI->idx_end()));

  assert((isa<CallInst>(Inst) || isa<BinaryOperator>(Inst) ||
          isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
          isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
          isa<ShuffleVectorInst>(Inst)) &&
         "Invalid/unknown instruction");

  // Mix in the opcode.
  return hash_combine(
      Inst->getOpcode(),
      hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
}

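// For example (illustrative IR), because 'add' is commutative,
//   %x = add i32 %a, %b
//   %y = add i32 %b, %a
// hash to the same value (operands are canonicalized by pointer order in
// getHashValue above) and compare equal below, so EarlyCSE can replace %y
// with %x.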
bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;

  if (LHS.isSentinel() || RHS.isSentinel())
    return LHSI == RHSI;

  if (LHSI->getOpcode() != RHSI->getOpcode())
    return false;
  if (LHSI->isIdenticalToWhenDefined(RHSI))
    return true;

  // If we're not strictly identical, the instructions may still be equal if
  // one is a commuted form of the other.
  if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
    if (!LHSBinOp->isCommutative())
      return false;

    assert(isa<BinaryOperator>(RHSI) &&
           "same opcode, but different instruction type?");
    BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);

    // Commuted equality
    return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
           LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
  }
  if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
    assert(isa<CmpInst>(RHSI) &&
           "same opcode, but different instruction type?");
    CmpInst *RHSCmp = cast<CmpInst>(RHSI);
    // Commuted equality
    return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
           LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
           LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
  }

  return false;
}

//===----------------------------------------------------------------------===//
// CallValue
//===----------------------------------------------------------------------===//

namespace {
/// \brief Struct representing the available call values in the scoped hash
/// table.
struct CallValue {
  Instruction *Inst;

  CallValue(Instruction *I) : Inst(I) {
    assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
  }

  bool isSentinel() const {
    return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
           Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  static bool canHandle(Instruction *Inst) {
    // Don't value number anything that returns void.
    if (Inst->getType()->isVoidTy())
      return false;

    CallInst *CI = dyn_cast<CallInst>(Inst);
    if (!CI || !CI->onlyReadsMemory())
      return false;
    return true;
  }
};
}

namespace llvm {
template <> struct DenseMapInfo<CallValue> {
  static inline CallValue getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }
  static inline CallValue getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }
  static unsigned getHashValue(CallValue Val);
  static bool isEqual(CallValue LHS, CallValue RHS);
};
}

unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
  Instruction *Inst = Val.Inst;
  // Hash all of the operands as pointers and mix in the opcode.
  return hash_combine(
      Inst->getOpcode(),
      hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
}

bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
  Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
  if (LHS.isSentinel() || RHS.isSentinel())
    return LHSI == RHSI;
  return LHSI->isIdenticalTo(RHSI);
}

//===----------------------------------------------------------------------===//
// EarlyCSE implementation
//===----------------------------------------------------------------------===//

namespace {
/// \brief A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
/// canonicalize things as it goes. It is intended to be fast and catch obvious
/// cases so that instcombine and other passes are more effective. It is
/// expected that a later pass of GVN will catch the interesting/hard cases.
class EarlyCSE {
public:
  const TargetLibraryInfo &TLI;
  const TargetTransformInfo &TTI;
  DominatorTree &DT;
  AssumptionCache &AC;
  MemorySSA *MSSA;
  std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
  typedef RecyclingAllocator<
      BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
  typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
                          AllocatorTy> ScopedHTType;

  /// \brief A scoped hash table of the current values of all of our simple
  /// scalar expressions.
  ///
  /// As we walk down the domtree, we look to see if instructions are in this:
  /// if so, we replace them with what we find, otherwise we insert them so
  /// that dominated values can succeed in their lookup.
  ScopedHTType AvailableValues;

  /// A scoped hash table of the current values of previously encountered
  /// memory locations.
  ///
  /// This allows us to get efficient access to dominating loads or stores when
  /// we have a fully redundant load.  In addition to the most recent load, we
  /// keep track of a generation count of the read, which is compared against
  /// the current generation count.  The current generation count is incremented
  /// after every operation that may write to memory, which ensures that we only
  /// CSE loads with other loads that have no intervening store.  Ordering
  /// events (such as fences or atomic instructions) increment the generation
  /// count as well; essentially, we model these as writes to all possible
  /// locations.  Note that atomic and/or volatile loads and stores can be
  /// present in the table; it is the responsibility of the consumer to inspect
  /// the atomicity/volatility if needed.
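  ///
  /// For example (illustrative IR):
  ///   %v1 = load i32, i32* %p   ; recorded with the current generation
  ///   ...                       ; no may-write instruction, generation
  ///                             ; unchanged
  ///   %v2 = load i32, i32* %p   ; same generation, so replaced by %v1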
  struct LoadValue {
    Instruction *DefInst;
    unsigned Generation;
    int MatchingId;
    bool IsAtomic;
    bool IsInvariant;
    LoadValue()
        : DefInst(nullptr), Generation(0), MatchingId(-1), IsAtomic(false),
          IsInvariant(false) {}
    LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
              bool IsAtomic, bool IsInvariant)
        : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
          IsAtomic(IsAtomic), IsInvariant(IsInvariant) {}
  };
  typedef RecyclingAllocator<BumpPtrAllocator,
                             ScopedHashTableVal<Value *, LoadValue>>
      LoadMapAllocator;
  typedef ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
                          LoadMapAllocator> LoadHTType;
  LoadHTType AvailableLoads;

  /// \brief A scoped hash table of the current values of read-only call
  /// values.
  ///
  /// It uses the same generation count as loads.
  typedef ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>
      CallHTType;
  CallHTType AvailableCalls;

  /// \brief This is the current generation of the memory value.
  unsigned CurrentGeneration;

  /// \brief Set up the EarlyCSE runner for a particular function.
  EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
           DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA),
        MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
  }

  bool run();

private:
  // Almost a POD, but needs to call the constructors for the scoped hash
  // tables so that a new scope gets pushed on. These are RAII so that the
  // scope gets popped when the NodeScope is destroyed.
  class NodeScope {
  public:
    NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
              CallHTType &AvailableCalls)
        : Scope(AvailableValues), LoadScope(AvailableLoads),
          CallScope(AvailableCalls) {}

  private:
    NodeScope(const NodeScope &) = delete;
    void operator=(const NodeScope &) = delete;

    ScopedHTType::ScopeTy Scope;
    LoadHTType::ScopeTy LoadScope;
    CallHTType::ScopeTy CallScope;
  };

  // Contains all the needed information to create a stack for doing a depth
  // first traversal of the tree. This includes scopes for values, loads, and
  // calls as well as the generation. There is a child iterator so that the
  // children do not need to be stored separately.
  class StackNode {
  public:
    StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
              CallHTType &AvailableCalls, unsigned cg, DomTreeNode *n,
              DomTreeNode::iterator child, DomTreeNode::iterator end)
        : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
          EndIter(end), Scopes(AvailableValues, AvailableLoads, AvailableCalls),
          Processed(false) {}

    // Accessors.
    unsigned currentGeneration() { return CurrentGeneration; }
    unsigned childGeneration() { return ChildGeneration; }
    void childGeneration(unsigned generation) { ChildGeneration = generation; }
    DomTreeNode *node() { return Node; }
    DomTreeNode::iterator childIter() { return ChildIter; }
    DomTreeNode *nextChild() {
      DomTreeNode *child = *ChildIter;
      ++ChildIter;
      return child;
    }
    DomTreeNode::iterator end() { return EndIter; }
    bool isProcessed() { return Processed; }
    void process() { Processed = true; }

  private:
    StackNode(const StackNode &) = delete;
    void operator=(const StackNode &) = delete;

    // Members.
    unsigned CurrentGeneration;
    unsigned ChildGeneration;
    DomTreeNode *Node;
    DomTreeNode::iterator ChildIter;
    DomTreeNode::iterator EndIter;
    NodeScope Scopes;
    bool Processed;
  };

  /// \brief Wrapper class to handle memory instructions, including loads,
  /// stores and intrinsic loads and stores defined by the target.
  class ParseMemoryInst {
  public:
    ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
      : IsTargetMemInst(false), Inst(Inst) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
        if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
          IsTargetMemInst = true;
    }
    bool isLoad() const {
      if (IsTargetMemInst) return Info.ReadMem;
      return isa<LoadInst>(Inst);
    }
    bool isStore() const {
      if (IsTargetMemInst) return Info.WriteMem;
      return isa<StoreInst>(Inst);
    }
    bool isAtomic() const {
      if (IsTargetMemInst) {
        assert(Info.IsSimple && "need to refine IsSimple in TTI");
        return false;
      }
      return Inst->isAtomic();
    }
    bool isUnordered() const {
      if (IsTargetMemInst) {
        assert(Info.IsSimple && "need to refine IsSimple in TTI");
        return true;
      }
      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        return LI->isUnordered();
      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        return SI->isUnordered();
      }
      // Conservative answer
      return !Inst->isAtomic();
    }

    bool isVolatile() const {
      if (IsTargetMemInst) {
        assert(Info.IsSimple && "need to refine IsSimple in TTI");
        return false;
      }
      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        return LI->isVolatile();
      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        return SI->isVolatile();
      }
      // Conservative answer
      return true;
    }

    bool isInvariantLoad() const {
      if (auto *LI = dyn_cast<LoadInst>(Inst))
        return LI->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
      return false;
    }

    bool isMatchingMemLoc(const ParseMemoryInst &Inst) const {
      return (getPointerOperand() == Inst.getPointerOperand() &&
              getMatchingId() == Inst.getMatchingId());
    }
    bool isValid() const { return getPointerOperand() != nullptr; }

    // For regular (non-intrinsic) loads/stores, this is set to -1. For
    // intrinsic loads/stores, the id is retrieved from the corresponding
    // field in the MemIntrinsicInfo structure.  That field contains
    // non-negative values only.
    int getMatchingId() const {
      if (IsTargetMemInst) return Info.MatchingId;
      return -1;
    }
    Value *getPointerOperand() const {
      if (IsTargetMemInst) return Info.PtrVal;
      if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
        return LI->getPointerOperand();
      } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
        return SI->getPointerOperand();
      }
      return nullptr;
    }
    bool mayReadFromMemory() const {
      if (IsTargetMemInst) return Info.ReadMem;
      return Inst->mayReadFromMemory();
    }
    bool mayWriteToMemory() const {
      if (IsTargetMemInst) return Info.WriteMem;
      return Inst->mayWriteToMemory();
    }

  private:
    bool IsTargetMemInst;
    MemIntrinsicInfo Info;
    Instruction *Inst;
  };

  bool processNode(DomTreeNode *Node);

  Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
    if (auto *LI = dyn_cast<LoadInst>(Inst))
      return LI;
    if (auto *SI = dyn_cast<StoreInst>(Inst))
      return SI->getValueOperand();
    assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
    return TTI.getOrCreateResultFromMemIntrinsic(cast<IntrinsicInst>(Inst),
                                                 ExpectedType);
  }

  bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
                           Instruction *EarlierInst, Instruction *LaterInst);

  void removeMSSA(Instruction *Inst) {
    if (!MSSA)
      return;
    // Removing a store here can leave MemorySSA in an unoptimized state by
    // creating MemoryPhis that have identical arguments and by creating
    // MemoryUses whose defining access is not an actual clobber.  We handle the
    // phi case eagerly here.  The non-optimized MemoryUse case is lazily
    // updated by MemorySSA getClobberingMemoryAccess.
    if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) {
      // Optimize MemoryPhi nodes that may become redundant by having all the
      // same input values once MA is removed.
      SmallVector<MemoryPhi *, 4> PhisToCheck;
      SmallVector<MemoryAccess *, 8> WorkQueue;
      WorkQueue.push_back(MA);
      // Process MemoryPhi nodes in FIFO order using an ever-growing vector
      // since we shouldn't be processing that many phis and this will avoid
      // an allocation in almost all cases.
      for (unsigned I = 0; I < WorkQueue.size(); ++I) {
        MemoryAccess *WI = WorkQueue[I];

        for (auto *U : WI->users())
          if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
            PhisToCheck.push_back(MP);

        MSSAUpdater->removeMemoryAccess(WI);

        for (MemoryPhi *MP : PhisToCheck) {
          MemoryAccess *FirstIn = MP->getIncomingValue(0);
          if (all_of(MP->incoming_values(),
                     [=](Use &In) { return In == FirstIn; }))
            WorkQueue.push_back(MP);
        }
        PhisToCheck.clear();
      }
    }
  }
};
}

/// Determine if the memory referenced by LaterInst is from the same heap
/// version as EarlierInst.
/// This is currently called in two scenarios:
///
///   load p
///   ...
///   load p
///
/// and
///
///   x = load p
///   ...
///   store x, p
///
/// in both cases we want to verify that there are no possible writes to the
/// memory referenced by p between the earlier and later instruction.
bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
                                   unsigned LaterGeneration,
                                   Instruction *EarlierInst,
                                   Instruction *LaterInst) {
  // Check the simple memory generation tracking first.
  if (EarlierGeneration == LaterGeneration)
    return true;

  if (!MSSA)
    return false;

  // Since we know LaterDef dominates LaterInst and EarlierInst dominates
  // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
  // EarlierInst and LaterInst and neither can any other write that potentially
  // clobbers LaterInst.
  MemoryAccess *LaterDef =
      MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
  return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst));
}

bool EarlyCSE::processNode(DomTreeNode *Node) {
  bool Changed = false;
  BasicBlock *BB = Node->getBlock();

  // If this block has a single predecessor, then the predecessor is the parent
  // of the domtree node and all of the live-out memory values are still current
  // in this block.  If this block has multiple predecessors, then they could
  // have invalidated the live-out memory values of our parent.  For now, just
  // be conservative and invalidate memory if this block has multiple
  // predecessors.
  if (!BB->getSinglePredecessor())
    ++CurrentGeneration;

  // If this node has a single predecessor which ends in a conditional branch,
  // we can infer the value of the branch condition given that we took this
  // path.  We need the single predecessor to ensure there's not another path
  // which reaches this block where the condition might hold a different
  // value.  Since we're adding this to the scoped hash table (like any other
  // def), it will have been popped if we encounter a future merge block.
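  //
  // For example (illustrative IR):
  //   entry:
  //     br i1 %cond, label %taken, label %untaken
  //   taken:                        ; single predecessor: entry
  //     ...
  // While processing 'taken', %cond is mapped to 'true' in AvailableValues,
  // and dominated uses of %cond are rewritten to that constant.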
  if (BasicBlock *Pred = BB->getSinglePredecessor())
    if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
      if (BI->isConditional())
        if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
          if (SimpleValue::canHandle(CondInst)) {
            assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
            auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
              ConstantInt::getTrue(BB->getContext()) :
              ConstantInt::getFalse(BB->getContext());
            AvailableValues.insert(CondInst, ConditionalConstant);
            DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
                  << CondInst->getName() << "' as " << *ConditionalConstant
                  << " in " << BB->getName() << "\n");
            // Replace all dominated uses with the known value.
            if (unsigned Count =
                    replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
                                             BasicBlockEdge(Pred, BB))) {
              Changed = true;
              NumCSECVP = NumCSECVP + Count;
            }
          }

  /// LastStore - Keep track of the last non-volatile store that we saw... for
  /// as long as there is no instruction that reads memory.  If we see a store
  /// to the same location, we delete the dead store.  This zaps trivial dead
  /// stores which can occur in bitfield code among other things.
  Instruction *LastStore = nullptr;

  const DataLayout &DL = BB->getModule()->getDataLayout();

  // See if any instructions in the block can be eliminated.  If so, do it.  If
  // not, add them to AvailableValues.
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
    Instruction *Inst = &*I++;

    // Dead instructions should just be removed.
    if (isInstructionTriviallyDead(Inst, &TLI)) {
      DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
      removeMSSA(Inst);
      Inst->eraseFromParent();
      Changed = true;
      ++NumSimplify;
      continue;
    }

    // Skip assume intrinsics, they don't really have side effects (although
    // they're marked as such to ensure preservation of control dependencies),
    // and this pass will not disturb any of the assumption's control
    // dependencies.
    if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
      DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
      continue;
    }

    // Skip invariant.start intrinsics since they only read memory, and we can
    // forward values across it. Also, we don't need to consume the last store
    // since the semantics of invariant.start allow us to perform DSE of the
    // last store, if there was a store following invariant.start. Consider:
    //
    //   store 30, i8* p
    //   invariant.start(p)
    //   store 40, i8* p
    //
    // We can DSE the store of 30, since the store of 40 to the invariant
    // location p causes undefined behaviour.
    if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>()))
      continue;

    if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) {
      if (auto *CondI =
              dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
        // The condition we're guarding on here is true in all dominated
        // locations.
        if (SimpleValue::canHandle(CondI))
          AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
      }

      // Guard intrinsics read all memory, but don't write any memory.
      // Accordingly, don't update the generation but consume the last store (to
      // avoid an incorrect DSE).
      LastStore = nullptr;
      continue;
    }

    // If the instruction can be simplified (e.g. X+0 = X) then replace it with
    // its simpler value.
    if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) {
      DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << "  to: " << *V << '\n');
      bool Killed = false;
      if (!Inst->use_empty()) {
        Inst->replaceAllUsesWith(V);
        Changed = true;
      }
      if (isInstructionTriviallyDead(Inst, &TLI)) {
        removeMSSA(Inst);
        Inst->eraseFromParent();
        Changed = true;
        Killed = true;
      }
      if (Changed)
        ++NumSimplify;
      if (Killed)
        continue;
    }

    // If this is a simple instruction that we can value number, process it.
    if (SimpleValue::canHandle(Inst)) {
      // See if the instruction has an available value.  If so, use it.
      if (Value *V = AvailableValues.lookup(Inst)) {
        DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << "  to: " << *V << '\n');
        if (auto *I = dyn_cast<Instruction>(V))
          I->andIRFlags(Inst);
        Inst->replaceAllUsesWith(V);
        removeMSSA(Inst);
        Inst->eraseFromParent();
        Changed = true;
        ++NumCSE;
        continue;
      }

      // Otherwise, just remember that this value is available.
      AvailableValues.insert(Inst, Inst);
      continue;
    }

    ParseMemoryInst MemInst(Inst, TTI);
    // If this is a non-volatile load, process it.
    if (MemInst.isValid() && MemInst.isLoad()) {
      // (Conservatively) we can't peek past the ordering implied by this
      // operation, but we can add this load to our set of available values.
      if (MemInst.isVolatile() || !MemInst.isUnordered()) {
        LastStore = nullptr;
        ++CurrentGeneration;
      }

      // If we have an available version of this load, and if it is the right
      // generation or the load is known to be from an invariant location,
      // replace this instruction.
      //
      // If either the dominating load or the current load are invariant, then
      // we can assume the current load loads the same value as the dominating
      // load.
      LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
      if (InVal.DefInst != nullptr &&
          InVal.MatchingId == MemInst.getMatchingId() &&
          // We don't yet handle removing loads with ordering of any kind.
          !MemInst.isVolatile() && MemInst.isUnordered() &&
          // We can't replace an atomic load with one which isn't also atomic.
          InVal.IsAtomic >= MemInst.isAtomic() &&
          (InVal.IsInvariant || MemInst.isInvariantLoad() ||
           isSameMemGeneration(InVal.Generation, CurrentGeneration,
                               InVal.DefInst, Inst))) {
        Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
        if (Op != nullptr) {
          DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
                       << "  to: " << *InVal.DefInst << '\n');
          if (!Inst->use_empty())
            Inst->replaceAllUsesWith(Op);
          removeMSSA(Inst);
          Inst->eraseFromParent();
          Changed = true;
          ++NumCSELoad;
          continue;
        }
      }

      // Otherwise, remember that we have this instruction.
      AvailableLoads.insert(
          MemInst.getPointerOperand(),
          LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
                    MemInst.isAtomic(), MemInst.isInvariantLoad()));
      LastStore = nullptr;
      continue;
    }

    // If this instruction may read from memory or throw (and potentially read
    // from memory in the exception handler), forget LastStore.  Load/store
    // intrinsics will indicate both a read and a write to memory.  The target
    // may override this (e.g. so that a store intrinsic does not read from
    // memory, and thus will be treated the same as a regular store for
    // commoning purposes).
    if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
        !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
      LastStore = nullptr;

    // If this is a read-only call, process it.
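    //
    // For example (illustrative IR), two calls to a readonly function with
    // identical arguments and the same memory generation can be commoned:
    //   %a = call i32 @f(i32 %x)   ; @f only reads memory
    //   ...                        ; no intervening may-write instruction
    //   %b = call i32 @f(i32 %x)   ; replaced by %a and erased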
    if (CallValue::canHandle(Inst)) {
      // If we have an available version of this call, and if it is the right
      // generation, replace this instruction.
      std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
      if (InVal.first != nullptr &&
          isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
                              Inst)) {
        DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
                     << "  to: " << *InVal.first << '\n');
        if (!Inst->use_empty())
          Inst->replaceAllUsesWith(InVal.first);
        removeMSSA(Inst);
        Inst->eraseFromParent();
        Changed = true;
        ++NumCSECall;
        continue;
      }

      // Otherwise, remember that we have this instruction.
      AvailableCalls.insert(
          Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration));
      continue;
    }

    // A release fence requires that all stores complete before it, but does
    // not prevent the reordering of following loads 'before' the fence.  As a
    // result, we don't need to consider it as writing to memory and don't need
    // to advance the generation.  We do need to prevent DSE across the fence,
    // but that's handled above.
    if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
      if (FI->getOrdering() == AtomicOrdering::Release) {
        assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
        continue;
      }

    // Write-back DSE - If we write back the same value we just loaded from
    // the same location and haven't passed any intervening writes or ordering
    // operations, we can remove the write.  The primary benefit is in allowing
    // the available load table to remain valid and value forward past where
    // the store originally was.
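    //
    // For example (illustrative IR):
    //   %v = load i32, i32* %p
    //   ...                        ; same memory generation, no ordering ops
    //   store i32 %v, i32* %p      ; redundant, removed below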
    if (MemInst.isValid() && MemInst.isStore()) {
      LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
      if (InVal.DefInst &&
          InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
          InVal.MatchingId == MemInst.getMatchingId() &&
          // We don't yet handle removing stores with ordering of any kind.
          !MemInst.isVolatile() && MemInst.isUnordered() &&
          isSameMemGeneration(InVal.Generation, CurrentGeneration,
                              InVal.DefInst, Inst)) {
        // It is okay to have a LastStore to a different pointer here if MemorySSA
        // tells us that the load and store are from the same memory generation.
        // In that case, LastStore should keep its present value since we're
        // removing the current store.
        assert((!LastStore ||
                ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
                    MemInst.getPointerOperand() ||
                MSSA) &&
               "can't have an intervening store if not using MemorySSA!");
        DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
        removeMSSA(Inst);
        Inst->eraseFromParent();
        Changed = true;
        ++NumDSE;
        // We can avoid incrementing the generation count since we were able
        // to eliminate this store.
        continue;
      }
    }

    // Okay, this isn't something we can CSE at all.  Check to see if it is
    // something that could modify memory.  If so, our available memory values
    // cannot be used so bump the generation count.
    if (Inst->mayWriteToMemory()) {
      ++CurrentGeneration;

      if (MemInst.isValid() && MemInst.isStore()) {
        // We do a trivial form of DSE if there are two stores to the same
        // location with no intervening loads.  Delete the earlier store.
        // At the moment, we don't remove ordered stores, but do remove
        // unordered atomic stores.  There's no special requirement (for
        // unordered atomics) about removing atomic stores only in favor of
        // other atomic stores since we're going to execute the non-atomic
        // one anyway and the atomic one might never have become visible.
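        //
        // For example (illustrative IR):
        //   store i32 1, i32* %p   ; dead, removed when the next store is seen
        //   store i32 2, i32* %p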
        if (LastStore) {
          ParseMemoryInst LastStoreMemInst(LastStore, TTI);
          assert(LastStoreMemInst.isUnordered() &&
                 !LastStoreMemInst.isVolatile() &&
                 "Violated invariant");
          if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
            DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
                         << "  due to: " << *Inst << '\n');
            removeMSSA(LastStore);
            LastStore->eraseFromParent();
            Changed = true;
            ++NumDSE;
            LastStore = nullptr;
          }
          // fallthrough - we can exploit information about this store
        }

        // Okay, we just invalidated anything we knew about loaded values.  Try
        // to salvage *something* by remembering that the stored value is a live
        // version of the pointer.  It is safe to forward from volatile stores
        // to non-volatile loads, so we don't have to check for volatility of
        // the store.
        AvailableLoads.insert(
            MemInst.getPointerOperand(),
            LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
                      MemInst.isAtomic(), /*IsInvariant=*/false));

        // Remember that this was the last unordered store we saw for DSE. We
        // don't yet handle DSE on ordered or volatile stores since we don't
        // have a good way to model the ordering requirement for following
        // passes once the store is removed.  We could insert a fence, but
        // since fences are slightly stronger than stores in their ordering,
        // it's not clear this is a profitable transform. Another option would
        // be to merge the ordering with that of the post-dominating store.
        if (MemInst.isUnordered() && !MemInst.isVolatile())
          LastStore = Inst;
        else
          LastStore = nullptr;
      }
    }
  }

  return Changed;
}

bool EarlyCSE::run() {
  // Note, a deque is being used here because there are significant performance
  // gains over vector when the container becomes very large due to the
  // specific access patterns. For more information see the mailing list
  // discussion on this:
  // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
  std::deque<StackNode *> nodesToProcess;

  bool Changed = false;

  // Process the root node.
  nodesToProcess.push_back(new StackNode(
      AvailableValues, AvailableLoads, AvailableCalls, CurrentGeneration,
      DT.getRootNode(), DT.getRootNode()->begin(), DT.getRootNode()->end()));

  // Save the current generation.
  unsigned LiveOutGeneration = CurrentGeneration;

  // Process the stack.
  while (!nodesToProcess.empty()) {
    // Look at the node on top of the stack and set the current generation;
    // either process it, push its next child, or pop it once it is done.
    StackNode *NodeToProcess = nodesToProcess.back();

    // Initialize class members.
    CurrentGeneration = NodeToProcess->currentGeneration();

    // Check if the node needs to be processed.
    if (!NodeToProcess->isProcessed()) {
      // Process the node.
      Changed |= processNode(NodeToProcess->node());
      NodeToProcess->childGeneration(CurrentGeneration);
      NodeToProcess->process();
    } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
      // Push the next child onto the stack.
      DomTreeNode *child = NodeToProcess->nextChild();
      nodesToProcess.push_back(
          new StackNode(AvailableValues, AvailableLoads, AvailableCalls,
                        NodeToProcess->childGeneration(), child, child->begin(),
                        child->end()));
    } else {
      // It has been processed, and there are no more children to process,
      // so delete it and pop it off the stack.
      delete NodeToProcess;
      nodesToProcess.pop_back();
    }
  } // while (!nodes...)

  // Reset the current generation.
  CurrentGeneration = LiveOutGeneration;

  return Changed;
}

PreservedAnalyses EarlyCSEPass::run(Function &F,
                                    FunctionAnalysisManager &AM) {
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto *MSSA =
      UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;

  EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);

  if (!CSE.run())
    return PreservedAnalyses::all();

  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<GlobalsAA>();
  if (UseMemorySSA)
    PA.preserve<MemorySSAAnalysis>();
  return PA;
}

namespace {
/// \brief A simple and fast domtree-based CSE pass.
///
/// This pass does a simple depth-first walk over the dominator tree,
/// eliminating trivially redundant instructions and using instsimplify to
/// canonicalize things as it goes. It is intended to be fast and catch obvious
/// cases so that instcombine and other passes are more effective. It is
/// expected that a later pass of GVN will catch the interesting/hard cases.
template<bool UseMemorySSA>
class EarlyCSELegacyCommonPass : public FunctionPass {
public:
  static char ID;

  EarlyCSELegacyCommonPass() : FunctionPass(ID) {
    if (UseMemorySSA)
      initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
    else
      initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
    auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
    auto *MSSA =
        UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;

    EarlyCSE CSE(TLI, TTI, DT, AC, MSSA);

    return CSE.run();
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetLibraryInfoWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    if (UseMemorySSA) {
      AU.addRequired<MemorySSAWrapperPass>();
      AU.addPreserved<MemorySSAWrapperPass>();
    }
    AU.addPreserved<GlobalsAAWrapperPass>();
    AU.setPreservesCFG();
  }
};
}

using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;

template<>
char EarlyCSELegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)

using EarlyCSEMemSSALegacyPass =
    EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;

template<>
char EarlyCSEMemSSALegacyPass::ID = 0;

FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
  if (UseMemorySSA)
    return new EarlyCSEMemSSALegacyPass();
  else
    return new EarlyCSELegacyPass();
}

INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
                      "Early CSE w/ MemorySSA", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
                    "Early CSE w/ MemorySSA", false, false)
1069