//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass.  Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles.  It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis.  Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Similarly to the Vectorizer, the pass also supports loop versioning to
// disambiguate potentially overlapping arrays at run time.
//
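// As an illustration (a hand-written sketch, not drawn from a particular test
// case), given a loop in which only one statement carries a dependence cycle:
//
//     for (i = 0; i < n; i++) {
//       a[i + 1] = a[i] + b[i];   // dependence cycle, not vectorizable
//       c[i] = d[i] * e[i];       // independent, vectorizable
//     }
//
// the pass distributes it into:
//
//     for (i = 0; i < n; i++)
//       a[i + 1] = a[i] + b[i];   // remains scalar
//
//     for (i = 0; i < n; i++)
//       c[i] = d[i] * e[i];       // can now be vectorized
//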
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;

static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));
static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loops marked with #pragma loop distribute(enable)"));
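
// For reference (assuming the Clang front end), the higher threshold above
// applies to loops annotated with a source-level pragma such as:
//
//   #pragma clang loop distribute(enable)
//   for (int i = 0; i < n; i++)
//     ...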

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning.  After cloning, it hosts the new loop.
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other.  This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }
  /// \brief Populates the partition with the transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *Op = dyn_cast<Instruction>(V);
        if (Op && OrigLoop->contains(Op->getParent()) && Set.insert(Op).second)
          Worklist.push_back(Op);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief Returns the cloned loop.  If this partition is mapped to the
  /// original loop, this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, that loop is returned.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards.  Since users typically appear after
    // their definitions, most uses are removed before their defs, reducing
    // the number of def-use and use-def chains that must be updated.
    for (auto *Inst : make_range(Unused.rbegin(), Unused.rend())) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop.  If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader.  If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized.  If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions.  It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles.  Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  /// Initially we isolate memory instructions into as many partitions as
  /// possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition.  We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't vectorize
  /// it.  Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool SeenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          SeenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return SeenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependence information with the assumption
  /// that the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
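  ///
  /// For example (an illustrative sketch), if Load1 is needed by both P0 and
  /// P2 in:
  ///
  ///   P0: Load1, Store2
  ///   P1: Store3
  ///   P2: Load1, Store4
  ///
  /// then duplicating Load1 into P2 would allow Store3 to execute before one
  /// of its copies; instead P1 and P2 are merged back into P0.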
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and set up equivalence classes between
    // partitions that contain the same load.  Also put the partitions in
    // between them into the same class to avoid reordering memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader.  This
    // leaves the merged members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions.  If the
  /// instruction is duplicated across multiple partitions, set the entry to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates each partition with everything that its seed
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(PartitionContainer.size() > 1 && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last.  Clone the original
    // loop before PH along with adding a preheader for the cloned loop.  Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop.  Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the partition id that the
  /// pointer is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the I-th
  /// entry in LAI.getRuntimePointerChecking()->Pointers.  If the pointer is
  /// used in multiple partitions its entry is set to -1.
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id.  If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if they both
  /// satisfy the \p Predicate.
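  ///
  /// For example (an illustrative sketch), given partitions [P0 P1 P2 P3]
  /// where only P1 and P2 satisfy \p Predicate, P2 is merged into P1 and then
  /// erased, leaving [P0 P1 P3].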
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start at this instruction
/// and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating
/// this number, we know whether any unsafe dependence crosses over a given
/// program point.
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first.  (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
                        DominatorTree *DT, ScalarEvolution *SE)
      : L(L), LI(LI), LAI(LAI), DT(DT), SE(SE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop() {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH) {
      DEBUG(dbgs() << "Skipping; no preheader\n");
      return false;
    }
    if (!L->getExitBlock()) {
      DEBUG(dbgs() << "Skipping; multiple exit blocks\n");
      return false;
    }
    // LAA will check that we only have a single exiting block.

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI.canVectorizeMemory()) {
      DEBUG(dbgs() << "Skipping; memory operations are safe for "
                      "vectorization\n");
      return false;
    }
    auto *Dependences = LAI.getDepChecker().getDependences();
    if (!Dependences || Dependences->empty()) {
      DEBUG(dbgs() << "Skipping; no unsafe dependences to isolate\n");
      return false;
    }

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through the memory operations and assign them to consecutive
    // partitions (the order of partitions follows program order).  Put those
    // with unsafe dependences into a "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations.  This
    // is to preserve the original program order after distribution.  E.g.:
    //
    //                NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                     1                       0->1
    //  Load2    | /Unsafe/            0                       1
    //  Store3  -'                    -1                       1->0
    //  Load4                          0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI.getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside.  These partitions can be out of
    // order from the original program order.  This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return false;

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return false;

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return false;
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI.PSE.getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold)) {
      DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
      return false;
    }

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple have an empty preheader before we version or clone
    // the loop.  (Also split if this has no predecessor, i.e. entry, because we
    // rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(LAI);
    const auto *RtPtrChecking = LAI.getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI.PSE.getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now remove from each loop the instructions that don't belong to its
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    return true;
  }

  /// \brief Returns whether distribution was forced to be enabled or disabled
  /// for the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false).  If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers.  Partition
  /// number -1 means that the pointer is used in multiple partitions.  In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
                 [&](const RuntimePointerChecking::PointerCheck &Check) {
                   for (unsigned PtrIdx1 : Check.first->Members)
                     for (unsigned PtrIdx2 : Check.second->Members)
                       // Only include this check if there is a pair of pointers
                       // that require checking and the pointers fall into
                       // separate partitions.
                       //
                       // (Note that we already know at this point that the two
                       // pointer groups need checking but it doesn't follow
                       // that each pair of pointers within the two groups needs
                       // checking as well.
                       //
                       // In other words we don't want to include a check just
                       // because there is a pair of pointers between the two
                       // pointer groups that require checks and a different
                       // pair whose pointers fall into different partitions.)
                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                           !RuntimePointerChecking::arePointersInSamePartition(
                               PtrToPartition, PtrIdx1, PtrIdx2))
                         return true;
                   return false;
                 });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
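  ///
  /// For reference (assuming the standard loop-metadata encoding), the
  /// metadata queried here is attached to the loop's latch branch and looks
  /// like:
  ///
  ///   !0 = distinct !{!0, !1}
  ///   !1 = !{!"llvm.loop.distribute.enable", i1 true}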
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  // Analyses used.
  Loop *L;
  LoopInfo *LI;
  const LoopAccessInfo &LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false).  If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
  /// performed by default.  Pass -enable-loop-distribute={0,1} overrides this
  /// default.  We use this to keep LoopDistribution off by default when invoked
  /// from the optimization pipeline but on when invoked explicitly from opt.
  LoopDistribute(bool ProcessAllLoopsByDefault = true)
      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
    // The default is set by the caller.
    if (EnableLoopDistribute.getNumOccurrences() > 0)
      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();

    // Build up a worklist of inner-loops to distribute.  This is necessary as
    // the act of distributing a loop creates new loops and can invalidate
    // iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist) {
      const LoopAccessInfo &LAI = LAA->getInfo(L, ValueToValueMap());
      LoopDistributeForLoop LDL(L, LI, LAI, DT, SE);

      // If distribution was forced for the specific loop to be
      // enabled/disabled, follow that.  Otherwise use the global flag.
      if (LDL.isForced().getValueOr(ProcessAllLoops))
        Changed |= LDL.processLoop();
    }

    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
  }

  static char ID;

private:
  /// \brief Whether distribution should be on in this function.  The per-loop
  /// pragma can override this.
  bool ProcessAllLoops;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}