//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Loop Distribution Pass.  Its main focus is to
// distribute loops that cannot be vectorized due to dependence cycles.  It
// tries to isolate the offending dependences into a new loop allowing
// vectorization of the remaining parts.
//
// For dependence analysis, the pass uses the LoopVectorizer's
// LoopAccessAnalysis.  Because this analysis presumes no change in the order of
// memory operations, special care is taken to preserve the lexical order of
// these operations.
//
// Like the Vectorizer, the pass also supports loop versioning to disambiguate
// potentially overlapping arrays at run time.
//
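// As an illustrative sketch of the transformation (hypothetical source loop,
// not taken from this file): S1 carries a dependence cycle, so it is isolated
// into its own loop, leaving S2 vectorizable:
//
//   for (i = 0; i < n; i++) {           for (i = 0; i < n; i++)
//     A[i + 1] = A[i] * B[i];  // S1      A[i + 1] = A[i] * B[i];  // S1
//     C[i] = D[i] * E[i];      // S2  =>
//   }                                   for (i = 0; i < n; i++)
//                                         C[i] = D[i] * E[i];      // S2
//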
//===----------------------------------------------------------------------===//

#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include <list>

#define LDIST_NAME "loop-distribute"
#define DEBUG_TYPE LDIST_NAME

using namespace llvm;
static cl::opt<bool>
    LDistVerify("loop-distribute-verify", cl::Hidden,
                cl::desc("Turn on DominatorTree and LoopInfo verification "
                         "after Loop Distribution"),
                cl::init(false));

static cl::opt<bool> DistributeNonIfConvertible(
    "loop-distribute-non-if-convertible", cl::Hidden,
    cl::desc("Whether to distribute into a loop that may not be "
             "if-convertible by the loop vectorizer"),
    cl::init(false));

static cl::opt<unsigned> DistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
    cl::desc("The maximum number of SCEV checks allowed for Loop "
             "Distribution"));

static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
    "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
    cl::Hidden,
    cl::desc(
        "The maximum number of SCEV checks allowed for Loop "
        "Distribution for loop marked with #pragma loop distribute(enable)"));
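
// For reference, an illustrative sketch of how such a loop would be marked in
// source (assuming Clang's pragma spelling):
//
//   #pragma clang loop distribute(enable)
//   for (...) { ... }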

// Note that the initial value for this depends on whether the pass is invoked
// directly or from the optimization pipeline.
static cl::opt<bool> EnableLoopDistribute(
    "enable-loop-distribute", cl::Hidden,
    cl::desc("Enable the new, experimental LoopDistribution Pass"));

STATISTIC(NumLoopsDistributed, "Number of loops distributed");

namespace {
/// \brief Maintains the set of instructions of the loop for a partition before
/// cloning.  After cloning, it hosts the new loop.
class InstPartition {
  typedef SmallPtrSet<Instruction *, 8> InstructionSet;

public:
  InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
      : DepCycle(DepCycle), OrigLoop(L), ClonedLoop(nullptr) {
    Set.insert(I);
  }

  /// \brief Returns whether this partition contains a dependence cycle.
  bool hasDepCycle() const { return DepCycle; }

  /// \brief Adds an instruction to this partition.
  void add(Instruction *I) { Set.insert(I); }

  /// \brief Collection accessors.
  InstructionSet::iterator begin() { return Set.begin(); }
  InstructionSet::iterator end() { return Set.end(); }
  InstructionSet::const_iterator begin() const { return Set.begin(); }
  InstructionSet::const_iterator end() const { return Set.end(); }
  bool empty() const { return Set.empty(); }

  /// \brief Moves this partition into \p Other.  This partition becomes empty
  /// after this.
  void moveTo(InstPartition &Other) {
    Other.Set.insert(Set.begin(), Set.end());
    Set.clear();
    Other.DepCycle |= DepCycle;
  }

  /// \brief Populates the partition with a transitive closure of all the
  /// instructions that the seeded instructions depend on.
  void populateUsedSet() {
    // FIXME: We currently don't use control-dependence but simply include all
    // blocks (possibly empty at the end) and let simplifycfg mostly clean this
    // up.
    for (auto *B : OrigLoop->getBlocks())
      Set.insert(B->getTerminator());

    // Follow the use-def chains to form a transitive closure of all the
    // instructions that the originally seeded instructions depend on.
    SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
    while (!Worklist.empty()) {
      Instruction *I = Worklist.pop_back_val();
      // Insert instructions from the loop that we depend on.
      for (Value *V : I->operand_values()) {
        auto *I = dyn_cast<Instruction>(V);
        if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
          Worklist.push_back(I);
      }
    }
  }

  /// \brief Clones the original loop.
  ///
  /// Updates LoopInfo and DominatorTree using the information that block \p
  /// LoopDomBB dominates the loop.
  Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
                               unsigned Index, LoopInfo *LI,
                               DominatorTree *DT) {
    ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
                                          VMap, Twine(".ldist") + Twine(Index),
                                          LI, DT, ClonedLoopBlocks);
    return ClonedLoop;
  }

  /// \brief The cloned loop.  If this partition is mapped to the original loop,
  /// this is null.
  const Loop *getClonedLoop() const { return ClonedLoop; }

  /// \brief Returns the loop where this partition ends up after distribution.
  /// If this partition is mapped to the original loop, the original loop is
  /// returned.
  const Loop *getDistributedLoop() const {
    return ClonedLoop ? ClonedLoop : OrigLoop;
  }

  /// \brief The VMap that is populated by cloning and then used in
  /// remapInstructions to remap the cloned instructions.
  ValueToValueMapTy &getVMap() { return VMap; }

  /// \brief Remaps the cloned instructions using VMap.
  void remapInstructions() {
    remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
  }

  /// \brief Based on the set of instructions selected for this partition,
  /// removes the unnecessary ones.
  void removeUnusedInsts() {
    SmallVector<Instruction *, 8> Unused;

    for (auto *Block : OrigLoop->getBlocks())
      for (auto &Inst : *Block)
        if (!Set.count(&Inst)) {
          Instruction *NewInst = &Inst;
          if (!VMap.empty())
            NewInst = cast<Instruction>(VMap[NewInst]);

          assert(!isa<BranchInst>(NewInst) &&
                 "Branches are marked used early on");
          Unused.push_back(NewInst);
        }

    // Delete the instructions backwards, as this reduces the number of def-use
    // and use-def chains that have to be updated.
    for (auto *Inst : reverse(Unused)) {
      if (!Inst->use_empty())
        Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
      Inst->eraseFromParent();
    }
  }

  void print() const {
    if (DepCycle)
      dbgs() << "  (cycle)\n";
    for (auto *I : Set)
      // Prefix with the block name.
      dbgs() << "  " << I->getParent()->getName() << ":" << *I << "\n";
  }

  void printBlocks() const {
    for (auto *BB : getDistributedLoop()->getBlocks())
      dbgs() << *BB;
  }

private:
  /// \brief Instructions from OrigLoop selected for this partition.
  InstructionSet Set;

  /// \brief Whether this partition contains a dependence cycle.
  bool DepCycle;

  /// \brief The original loop.
  Loop *OrigLoop;

  /// \brief The cloned loop.  If this partition is mapped to the original loop,
  /// this is null.
  Loop *ClonedLoop;

  /// \brief The blocks of ClonedLoop including the preheader.  If this
  /// partition is mapped to the original loop, this is empty.
  SmallVector<BasicBlock *, 8> ClonedLoopBlocks;

  /// \brief This gets populated once the set of instructions has been
  /// finalized.  If this partition is mapped to the original loop, it is not
  /// set.
  ValueToValueMapTy VMap;
};

/// \brief Holds the set of Partitions.  It populates them, merges them and then
/// clones the loops.
class InstPartitionContainer {
  typedef DenseMap<Instruction *, int> InstToPartitionIdT;

public:
  InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
      : L(L), LI(LI), DT(DT) {}

  /// \brief Returns the number of partitions.
  unsigned getSize() const { return PartitionContainer.size(); }

  /// \brief Adds \p Inst into the current partition if that is marked to
  /// contain cycles.  Otherwise start a new partition for it.
  void addToCyclicPartition(Instruction *Inst) {
    // If the current partition is non-cyclic, start a new one.
    if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
      PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
    else
      PartitionContainer.back().add(Inst);
  }

  /// \brief Adds \p Inst into a partition that is not marked to contain
  /// dependence cycles.
  ///
  //  Initially we isolate memory instructions into as many partitions as
  //  possible, then later we may merge them back together.
  void addToNewNonCyclicPartition(Instruction *Inst) {
    PartitionContainer.emplace_back(Inst, L);
  }

  /// \brief Merges adjacent non-cyclic partitions.
  ///
  /// The idea is that we currently only want to isolate the non-vectorizable
  /// partition.  We could later allow more distribution among these partitions
  /// too.
  void mergeAdjacentNonCyclic() {
    mergeAdjacentPartitionsIf(
        [](const InstPartition *P) { return !P->hasDepCycle(); });
  }

  /// \brief If a partition contains only conditional stores, we won't vectorize
  /// it.  Try to merge it with a previous cyclic partition.
  void mergeNonIfConvertible() {
    mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
      if (Partition->hasDepCycle())
        return true;

      // Now, check if all stores are conditional in this partition.
      bool seenStore = false;

      for (auto *Inst : *Partition)
        if (isa<StoreInst>(Inst)) {
          seenStore = true;
          if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
            return false;
        }
      return seenStore;
    });
  }

  /// \brief Merges the partitions according to various heuristics.
  void mergeBeforePopulating() {
    mergeAdjacentNonCyclic();
    if (!DistributeNonIfConvertible)
      mergeNonIfConvertible();
  }

  /// \brief Merges partitions in order to ensure that no loads are duplicated.
  ///
  /// We can't duplicate loads because that could potentially reorder them.
  /// LoopAccessAnalysis provides dependency information with the context that
  /// the order of memory operations is preserved.
  ///
  /// Returns true if any partitions were merged.
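  ///
  /// As an illustrative sketch (hypothetical partition numbering): if a load
  /// ends up duplicated into partitions P1 and P3, then P2 and P3 are both
  /// merged into P1, since merging only P3 would reorder the load relative to
  /// the memory operations in P2.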
  bool mergeToAvoidDuplicatedLoads() {
    typedef DenseMap<Instruction *, InstPartition *> LoadToPartitionT;
    typedef EquivalenceClasses<InstPartition *> ToBeMergedT;

    LoadToPartitionT LoadToPartition;
    ToBeMergedT ToBeMerged;

    // Step through the partitions and create equivalence between partitions
    // that contain the same load.  Also put partitions in between them in the
    // same equivalence class to avoid reordering of memory operations.
    for (PartitionContainerT::iterator I = PartitionContainer.begin(),
                                       E = PartitionContainer.end();
         I != E; ++I) {
      auto *PartI = &*I;

      // If a load occurs in two partitions PartI and PartJ, merge all
      // partitions (PartI, PartJ] into PartI.
      for (Instruction *Inst : *PartI)
        if (isa<LoadInst>(Inst)) {
          bool NewElt;
          LoadToPartitionT::iterator LoadToPart;

          std::tie(LoadToPart, NewElt) =
              LoadToPartition.insert(std::make_pair(Inst, PartI));
          if (!NewElt) {
            DEBUG(dbgs() << "Merging partitions due to this load in multiple "
                         << "partitions: " << PartI << ", "
                         << LoadToPart->second << "\n" << *Inst << "\n");

            auto PartJ = I;
            do {
              --PartJ;
              ToBeMerged.unionSets(PartI, &*PartJ);
            } while (&*PartJ != LoadToPart->second);
          }
        }
    }
    if (ToBeMerged.empty())
      return false;

    // Merge the members of an equivalence class into its class leader.  This
    // leaves the members empty.
    for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
         I != E; ++I) {
      if (!I->isLeader())
        continue;

      auto PartI = I->getData();
      for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
                                   ToBeMerged.member_end())) {
        PartJ->moveTo(*PartI);
      }
    }

    // Remove the empty partitions.
    PartitionContainer.remove_if(
        [](const InstPartition &P) { return P.empty(); });

    return true;
  }

  /// \brief Sets up the mapping from instructions to partitions.  If the
  /// instruction is duplicated across multiple partitions, set the entry to -1.
  void setupPartitionIdOnInstructions() {
    int PartitionID = 0;
    for (const auto &Partition : PartitionContainer) {
      for (Instruction *Inst : Partition) {
        bool NewElt;
        InstToPartitionIdT::iterator Iter;

        std::tie(Iter, NewElt) =
            InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
        if (!NewElt)
          Iter->second = -1;
      }
      ++PartitionID;
    }
  }

  /// \brief Populates the partition with everything that the seeding
  /// instructions require.
  void populateUsedSet() {
    for (auto &P : PartitionContainer)
      P.populateUsedSet();
  }

  /// \brief This performs the main chunk of the work of cloning the loops for
  /// the partitions.
  void cloneLoops() {
    BasicBlock *OrigPH = L->getLoopPreheader();
    // At this point the predecessor of the preheader is either the memcheck
    // block or the top part of the original preheader.
    BasicBlock *Pred = OrigPH->getSinglePredecessor();
    assert(Pred && "Preheader does not have a single predecessor");
    BasicBlock *ExitBlock = L->getExitBlock();
    assert(ExitBlock && "No single exit block");
    Loop *NewLoop;

    assert(!PartitionContainer.empty() && "at least two partitions expected");
    // We're cloning the preheader along with the loop so we already made sure
    // it was empty.
    assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
           "preheader not empty");

    // Create a loop for each partition except the last.  Clone the original
    // loop before PH along with adding a preheader for the cloned loop.  Then
    // update PH to point to the newly added preheader.
    BasicBlock *TopPH = OrigPH;
    unsigned Index = getSize() - 1;
    for (auto I = std::next(PartitionContainer.rbegin()),
              E = PartitionContainer.rend();
         I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
      auto *Part = &*I;

      NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);

      Part->getVMap()[ExitBlock] = TopPH;
      Part->remapInstructions();
    }
    Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);

    // Now go in forward order and update the immediate dominator for the
    // preheaders with the exiting block of the previous loop.  Dominance
    // within the loop is updated in cloneLoopWithPreheader.
    for (auto Curr = PartitionContainer.cbegin(),
              Next = std::next(PartitionContainer.cbegin()),
              E = PartitionContainer.cend();
         Next != E; ++Curr, ++Next)
      DT->changeImmediateDominator(
          Next->getDistributedLoop()->getLoopPreheader(),
          Curr->getDistributedLoop()->getExitingBlock());
  }

  /// \brief Removes the dead instructions from the cloned loops.
  void removeUnusedInsts() {
    for (auto &Partition : PartitionContainer)
      Partition.removeUnusedInsts();
  }

  /// \brief For each memory pointer, computes the partition id that the
  /// pointer is used in.
  ///
  /// This returns an array of int where the I-th entry corresponds to the I-th
  /// entry in LAI.getRuntimePointerChecking().  If the pointer is used in
  /// multiple partitions its entry is set to -1.
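  ///
  /// For example (an illustrative sketch, not actual output): with three
  /// checked pointers and two partitions, a result of {0, 1, -1} would mean
  /// the first pointer is accessed only in partition 0, the second only in
  /// partition 1, and the third in both.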
  SmallVector<int, 8>
  computePartitionSetForPointers(const LoopAccessInfo &LAI) {
    const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();

    unsigned N = RtPtrCheck->Pointers.size();
    SmallVector<int, 8> PtrToPartitions(N);
    for (unsigned I = 0; I < N; ++I) {
      Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
      auto Instructions =
          LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);

      int &Partition = PtrToPartitions[I];
      // First set it to uninitialized.
      Partition = -2;
      for (Instruction *Inst : Instructions) {
        // Note that this could be -1 if Inst is duplicated across multiple
        // partitions.
        int ThisPartition = this->InstToPartitionId[Inst];
        if (Partition == -2)
          Partition = ThisPartition;
        // -1 means belonging to multiple partitions.
        else if (Partition == -1)
          break;
        else if (Partition != (int)ThisPartition)
          Partition = -1;
      }
      assert(Partition != -2 && "Pointer not belonging to any partition");
    }

    return PtrToPartitions;
  }

  void print(raw_ostream &OS) const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      OS << "Partition " << Index++ << " (" << &P << "):\n";
      P.print();
    }
  }

  void dump() const { print(dbgs()); }

#ifndef NDEBUG
  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const InstPartitionContainer &Partitions) {
    Partitions.print(OS);
    return OS;
  }
#endif

  void printBlocks() const {
    unsigned Index = 0;
    for (const auto &P : PartitionContainer) {
      dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
      P.printBlocks();
    }
  }

private:
  typedef std::list<InstPartition> PartitionContainerT;

  /// \brief List of partitions.
  PartitionContainerT PartitionContainer;

  /// \brief Mapping from Instruction to partition Id.  If the instruction
  /// belongs to multiple partitions the entry contains -1.
  InstToPartitionIdT InstToPartitionId;

  Loop *L;
  LoopInfo *LI;
  DominatorTree *DT;

  /// \brief The control structure to merge adjacent partitions if both satisfy
  /// the \p Predicate.
  template <class UnaryPredicate>
  void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
    InstPartition *PrevMatch = nullptr;
    for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
      auto DoesMatch = Predicate(&*I);
      if (PrevMatch == nullptr && DoesMatch) {
        PrevMatch = &*I;
        ++I;
      } else if (PrevMatch != nullptr && DoesMatch) {
        I->moveTo(*PrevMatch);
        I = PartitionContainer.erase(I);
      } else {
        PrevMatch = nullptr;
        ++I;
      }
    }
  }
};

/// \brief For each memory instruction, this class maintains the difference
/// between the number of unsafe dependences that start at this instruction
/// and the number that end here.
///
/// By traversing the memory instructions in program order and accumulating this
/// number, we know whether any unsafe dependence crosses over a program point.
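///
/// As an illustrative sketch, with a single unsafe dependence Store3 -> Load1
/// (this mirrors the worked example in processLoop below):
///
///   Inst     NumUnsafeDependencesStartOrEnd   running sum
///   Load1                  +1                      1
///   Load2                   0                      1
///   Store3                 -1                      0
///
/// A positive running sum at a program point means an unsafe dependence is
/// active across it.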
class MemoryInstructionDependences {
  typedef MemoryDepChecker::Dependence Dependence;

public:
  struct Entry {
    Instruction *Inst;
    unsigned NumUnsafeDependencesStartOrEnd;

    Entry(Instruction *Inst) : Inst(Inst), NumUnsafeDependencesStartOrEnd(0) {}
  };

  typedef SmallVector<Entry, 8> AccessesType;

  AccessesType::const_iterator begin() const { return Accesses.begin(); }
  AccessesType::const_iterator end() const { return Accesses.end(); }

  MemoryInstructionDependences(
      const SmallVectorImpl<Instruction *> &Instructions,
      const SmallVectorImpl<Dependence> &Dependences) {
    Accesses.append(Instructions.begin(), Instructions.end());

    DEBUG(dbgs() << "Backward dependences:\n");
    for (auto &Dep : Dependences)
      if (Dep.isPossiblyBackward()) {
        // Note that the designations source and destination follow the program
        // order, i.e. source is always first.  (The direction is given by the
        // DepType.)
        ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
        --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;

        DEBUG(Dep.print(dbgs(), 2, Instructions));
      }
  }

private:
  AccessesType Accesses;
};

/// \brief The actual class performing the per-loop work.
class LoopDistributeForLoop {
public:
  LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
                        ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
      : L(L), F(F), LI(LI), LAI(nullptr), DT(DT), SE(SE), ORE(ORE) {
    setForced();
  }

  /// \brief Try to distribute an inner-most loop.
  bool processLoop(LoopAccessLegacyAnalysis *LAA) {
    assert(L->empty() && "Only process inner loops.");

    DEBUG(dbgs() << "\nLDist: In \"" << L->getHeader()->getParent()->getName()
                 << "\" checking " << *L << "\n");

    BasicBlock *PH = L->getLoopPreheader();
    if (!PH)
      return fail("no preheader");
    if (!L->getExitBlock())
      return fail("multiple exit blocks");

    // LAA will check that we only have a single exiting block.
    LAI = &LAA->getInfo(L);

    // Currently, we only distribute to isolate the part of the loop with
    // dependence cycles to enable partial vectorization.
    if (LAI->canVectorizeMemory())
      return fail("memory operations are safe for vectorization");

    auto *Dependences = LAI->getDepChecker().getDependences();
    if (!Dependences || Dependences->empty())
      return fail("no unsafe dependences to isolate");

    InstPartitionContainer Partitions(L, LI, DT);

    // First, go through each memory operation and assign it to a consecutive
    // partition (the order of partitions follows program order).  Put those
    // with unsafe dependences into the "cyclic" partition; otherwise put each
    // store in its own "non-cyclic" partition (we'll merge these later).
    //
    // Note that a memory operation (e.g. Load2 below) at a program point that
    // has an unsafe dependence (Store3->Load1) spanning over it must be
    // included in the same cyclic partition as the dependent operations.  This
    // is to preserve the original program order after distribution.  E.g.:
    //
    //                NumUnsafeDependencesStartOrEnd  NumUnsafeDependencesActive
    //  Load1   -.                     1                       0->1
    //  Load2    | /Unsafe/            0                       1
    //  Store3  -'                    -1                       1->0
    //  Load4                          0                       0
    //
    // NumUnsafeDependencesActive > 0 indicates this situation and in this case
    // we just keep assigning to the same cyclic partition until
    // NumUnsafeDependencesActive reaches 0.
    const MemoryDepChecker &DepChecker = LAI->getDepChecker();
    MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
                                     *Dependences);

    int NumUnsafeDependencesActive = 0;
    for (auto &InstDep : MID) {
      Instruction *I = InstDep.Inst;
      // We update NumUnsafeDependencesActive post-instruction, so catch the
      // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
      if (NumUnsafeDependencesActive ||
          InstDep.NumUnsafeDependencesStartOrEnd > 0)
        Partitions.addToCyclicPartition(I);
      else
        Partitions.addToNewNonCyclicPartition(I);
      NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
      assert(NumUnsafeDependencesActive >= 0 &&
             "Negative number of dependences active");
    }

    // Add partitions for values used outside.  These partitions can be out of
    // order from the original program order.  This is OK because if the
    // partition uses a load we will merge this partition with the original
    // partition of the load that we set up in the previous loop (see
    // mergeToAvoidDuplicatedLoads).
    auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
    for (auto *Inst : DefsUsedOutside)
      Partitions.addToNewNonCyclicPartition(Inst);

    DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
    // should be able to vectorize these together.
    Partitions.mergeBeforePopulating();
    DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
    if (Partitions.getSize() < 2)
      return fail("cannot isolate unsafe dependencies");

    // Now, populate the partitions with non-memory operations.
    Partitions.populateUsedSet();
    DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);

    // In order to preserve original lexical order for loads, keep them in the
    // partition that we set up in the MemoryInstructionDependences loop.
    if (Partitions.mergeToAvoidDuplicatedLoads()) {
      DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
                   << Partitions);
      if (Partitions.getSize() < 2)
        return fail("cannot isolate unsafe dependencies");
    }

    // Don't distribute the loop if we need too many SCEV run-time checks.
    const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
    if (Pred.getComplexity() > (IsForced.getValueOr(false)
                                    ? PragmaDistributeSCEVCheckThreshold
                                    : DistributeSCEVCheckThreshold))
      return fail("too many SCEV run-time checks needed");

    DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
    // We're done forming the partitions; set up the reverse mapping from
    // instructions to partitions.
    Partitions.setupPartitionIdOnInstructions();

    // To keep things simple, have an empty preheader before we version or
    // clone the loop.  (Also split if this has no predecessor, i.e. entry,
    // because we rely on PH having a predecessor.)
    if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
      SplitBlock(PH, PH->getTerminator(), DT, LI);

    // If we need run-time checks, version the loop now.
    auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
    const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
    const auto &AllChecks = RtPtrChecking->getChecks();
    auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
                                                  RtPtrChecking);

    if (!Pred.isAlwaysTrue() || !Checks.empty()) {
      DEBUG(dbgs() << "\nPointers:\n");
      DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
      LoopVersioning LVer(*LAI, L, LI, DT, SE, false);
      LVer.setAliasChecks(std::move(Checks));
      LVer.setSCEVChecks(LAI->getPSE().getUnionPredicate());
      LVer.versionLoop(DefsUsedOutside);
      LVer.annotateLoopWithNoAlias();
    }

    // Create identical copies of the original loop for each partition and hook
    // them up sequentially.
    Partitions.cloneLoops();

    // Now, we remove the instructions from each loop that don't belong to that
    // partition.
    Partitions.removeUnusedInsts();
    DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
    DEBUG(Partitions.printBlocks());

    if (LDistVerify) {
      LI->verify();
      DT->verifyDomTree();
    }

    ++NumLoopsDistributed;
    // Report the success.
    emitOptimizationRemark(F->getContext(), LDIST_NAME, *F, L->getStartLoc(),
                           "distributed loop");
    return true;
  }

  /// \brief Provide diagnostics, then \return false.
  bool fail(llvm::StringRef Message) {
    LLVMContext &Ctx = F->getContext();
    bool Forced = isForced().getValueOr(false);

    DEBUG(dbgs() << "Skipping; " << Message << "\n");

    // With -Rpass-missed, report that distribution failed.
    ORE->emitOptimizationRemarkMissed(
        LDIST_NAME, L,
        "loop not distributed: use -Rpass-analysis=loop-distribute for more "
        "info");

    // With -Rpass-analysis, report why.  This is on by default if distribution
    // was requested explicitly.
    emitOptimizationRemarkAnalysis(
        Ctx, Forced ? DiagnosticInfoOptimizationRemarkAnalysis::AlwaysPrint
                    : LDIST_NAME,
        *F, L->getStartLoc(), Twine("loop not distributed: ") + Message);

    // Also issue a warning if distribution was requested explicitly but it
    // failed.
    if (Forced)
      Ctx.diagnose(DiagnosticInfoOptimizationFailure(
          *F, L->getStartLoc(), "loop not distributed: failed "
                                "explicitly specified loop distribution"));

    return false;
  }

  /// \brief Returns whether distribution was forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false).  If the optional has no value
  /// distribution was not forced either way.
  const Optional<bool> &isForced() const { return IsForced; }

private:
  /// \brief Filter out checks between pointers from the same partition.
  ///
  /// \p PtrToPartition contains the partition number for pointers.  Partition
  /// number -1 means that the pointer is used in multiple partitions.  In this
  /// case we can't safely omit the check.
  SmallVector<RuntimePointerChecking::PointerCheck, 4>
  includeOnlyCrossPartitionChecks(
      const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
      const SmallVectorImpl<int> &PtrToPartition,
      const RuntimePointerChecking *RtPtrChecking) {
    SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;

    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
                 [&](const RuntimePointerChecking::PointerCheck &Check) {
                   for (unsigned PtrIdx1 : Check.first->Members)
                     for (unsigned PtrIdx2 : Check.second->Members)
                       // Only include this check if there is a pair of pointers
                       // that require checking and the pointers fall into
                       // separate partitions.
                       //
                       // (Note that we already know at this point that the two
                       // pointer groups need checking but it doesn't follow
                       // that each pair of pointers within the two groups need
                       // checking as well.
                       //
                       // In other words we don't want to include a check just
                       // because there is a pair of pointers between the two
                       // pointer groups that require checks and a different
                       // pair whose pointers fall into different partitions.)
                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
                           !RuntimePointerChecking::arePointersInSamePartition(
                               PtrToPartition, PtrIdx1, PtrIdx2))
                         return true;
                   return false;
                 });

    return Checks;
  }

  /// \brief Check whether the loop metadata is forcing distribution to be
  /// enabled/disabled.
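  ///
  /// An illustrative sketch of the metadata this looks for (standard loop
  /// metadata attached to the loop's latch branch; the metadata node numbers
  /// are hypothetical):
  ///
  ///   br i1 %cond, label %header, label %exit, !llvm.loop !0
  ///   !0 = distinct !{!0, !1}
  ///   !1 = !{!"llvm.loop.distribute.enable", i1 true}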
  void setForced() {
    Optional<const MDOperand *> Value =
        findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
    if (!Value)
      return;

    const MDOperand *Op = *Value;
    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
    IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
  }

  Loop *L;
  Function *F;

  // Analyses used.
  LoopInfo *LI;
  const LoopAccessInfo *LAI;
  DominatorTree *DT;
  ScalarEvolution *SE;
  OptimizationRemarkEmitter *ORE;

  /// \brief Indicates whether distribution is forced to be enabled/disabled for
  /// the loop.
  ///
  /// If the optional has a value, it indicates whether distribution was forced
  /// to be enabled (true) or disabled (false).  If the optional has no value
  /// distribution was not forced either way.
  Optional<bool> IsForced;
};

/// \brief The pass class.
class LoopDistribute : public FunctionPass {
public:
  /// \p ProcessAllLoopsByDefault specifies whether loop distribution should be
  /// performed by default.  Pass -enable-loop-distribute={0,1} overrides this
  /// default.  We use this to keep LoopDistribution off by default when invoked
  /// from the optimization pipeline but on when invoked explicitly from opt.
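  ///
  /// An illustrative invocation from opt with the legacy pass manager (the
  /// input file name is hypothetical):
  ///
  ///   opt -loop-distribute -enable-loop-distribute -S input.ll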
  LoopDistribute(bool ProcessAllLoopsByDefault = true)
      : FunctionPass(ID), ProcessAllLoops(ProcessAllLoopsByDefault) {
    // The default is set by the caller.
    if (EnableLoopDistribute.getNumOccurrences() > 0)
      ProcessAllLoops = EnableLoopDistribute;
    initializeLoopDistributePass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;

    auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
    auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
    auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();

    // Build up a worklist of inner loops to distribute.  This is necessary as
    // the act of distributing a loop creates new loops and can invalidate
    // iterators across the loops.
    SmallVector<Loop *, 8> Worklist;

    // Process each loop nest in the function.
    for (Loop *TopLevelLoop : *LI)
      for (Loop *L : depth_first(TopLevelLoop))
        // We only handle inner-most loops.
        if (L->empty())
          Worklist.push_back(L);

    // Now walk the identified inner loops.
    bool Changed = false;
    for (Loop *L : Worklist) {
      LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);

      // If distribution was forced for the specific loop to be
      // enabled/disabled, follow that.  Otherwise use the global flag.
      if (LDL.isForced().getValueOr(ProcessAllLoops))
        Changed |= LDL.processLoop(LAA);
    }

    return Changed;
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<LoopInfoWrapperPass>();
    AU.addPreserved<LoopInfoWrapperPass>();
    AU.addRequired<LoopAccessLegacyAnalysis>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addPreserved<DominatorTreeWrapperPass>();
    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
  }

  static char ID;

private:
  /// \brief Whether distribution should be on in this function.  The per-loop
  /// pragma can override this.
  bool ProcessAllLoops;
};
} // anonymous namespace

char LoopDistribute::ID;
static const char ldist_name[] = "Loop Distribution";

INITIALIZE_PASS_BEGIN(LoopDistribute, LDIST_NAME, ldist_name, false, false)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopDistribute, LDIST_NAME, ldist_name, false, false)

namespace llvm {
FunctionPass *createLoopDistributePass(bool ProcessAllLoopsByDefault) {
  return new LoopDistribute(ProcessAllLoopsByDefault);
}
}