10b57cec5SDimitry Andric //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file implements the Loop Distribution Pass. Its main focus is to 100b57cec5SDimitry Andric // distribute loops that cannot be vectorized due to dependence cycles. It 110b57cec5SDimitry Andric // tries to isolate the offending dependences into a new loop allowing 120b57cec5SDimitry Andric // vectorization of the remaining parts. 130b57cec5SDimitry Andric // 140b57cec5SDimitry Andric // For dependence analysis, the pass uses the LoopVectorizer's 150b57cec5SDimitry Andric // LoopAccessAnalysis. Because this analysis presumes no change in the order of 160b57cec5SDimitry Andric // memory operations, special care is taken to preserve the lexical order of 170b57cec5SDimitry Andric // these operations. 180b57cec5SDimitry Andric // 190b57cec5SDimitry Andric // Similarly to the Vectorizer, the pass also supports loop versioning to 200b57cec5SDimitry Andric // run-time disambiguate potentially overlapping arrays. 210b57cec5SDimitry Andric // 220b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 230b57cec5SDimitry Andric 240b57cec5SDimitry Andric #include "llvm/Transforms/Scalar/LoopDistribute.h" 250b57cec5SDimitry Andric #include "llvm/ADT/DenseMap.h" 260b57cec5SDimitry Andric #include "llvm/ADT/DepthFirstIterator.h" 270b57cec5SDimitry Andric #include "llvm/ADT/EquivalenceClasses.h" 280b57cec5SDimitry Andric #include "llvm/ADT/STLExtras.h" 29*0fca6ea1SDimitry Andric #include "llvm/ADT/SetVector.h" 300b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h" 310b57cec5SDimitry Andric #include "llvm/ADT/Statistic.h" 320b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 330b57cec5SDimitry Andric #include "llvm/ADT/Twine.h" 340b57cec5SDimitry Andric #include "llvm/ADT/iterator_range.h" 350b57cec5SDimitry Andric #include "llvm/Analysis/AssumptionCache.h" 360b57cec5SDimitry Andric #include "llvm/Analysis/GlobalsModRef.h" 370b57cec5SDimitry Andric #include "llvm/Analysis/LoopAccessAnalysis.h" 380b57cec5SDimitry Andric #include "llvm/Analysis/LoopAnalysisManager.h" 390b57cec5SDimitry Andric #include "llvm/Analysis/LoopInfo.h" 400b57cec5SDimitry Andric #include "llvm/Analysis/OptimizationRemarkEmitter.h" 410b57cec5SDimitry Andric #include "llvm/Analysis/ScalarEvolution.h" 420b57cec5SDimitry Andric #include "llvm/Analysis/TargetLibraryInfo.h" 430b57cec5SDimitry Andric #include "llvm/Analysis/TargetTransformInfo.h" 440b57cec5SDimitry Andric #include "llvm/IR/BasicBlock.h" 450b57cec5SDimitry Andric #include "llvm/IR/Constants.h" 460b57cec5SDimitry Andric #include "llvm/IR/DiagnosticInfo.h" 470b57cec5SDimitry Andric #include "llvm/IR/Dominators.h" 480b57cec5SDimitry Andric #include "llvm/IR/Function.h" 490b57cec5SDimitry Andric #include "llvm/IR/Instruction.h" 500b57cec5SDimitry Andric #include "llvm/IR/Instructions.h" 510b57cec5SDimitry Andric #include "llvm/IR/LLVMContext.h" 520b57cec5SDimitry Andric #include "llvm/IR/Metadata.h" 530b57cec5SDimitry Andric #include "llvm/IR/PassManager.h" 540b57cec5SDimitry Andric #include "llvm/IR/Value.h" 550b57cec5SDimitry Andric #include "llvm/Support/Casting.h" 560b57cec5SDimitry Andric #include "llvm/Support/CommandLine.h" 570b57cec5SDimitry Andric #include "llvm/Support/Debug.h" 580b57cec5SDimitry Andric #include "llvm/Support/raw_ostream.h" 590b57cec5SDimitry Andric #include "llvm/Transforms/Utils/BasicBlockUtils.h" 600b57cec5SDimitry Andric #include "llvm/Transforms/Utils/Cloning.h" 610b57cec5SDimitry Andric #include "llvm/Transforms/Utils/LoopUtils.h" 620b57cec5SDimitry Andric #include "llvm/Transforms/Utils/LoopVersioning.h" 630b57cec5SDimitry Andric #include "llvm/Transforms/Utils/ValueMapper.h" 640b57cec5SDimitry Andric #include <cassert> 650b57cec5SDimitry Andric #include <functional> 660b57cec5SDimitry Andric #include <list> 670b57cec5SDimitry Andric #include <tuple> 680b57cec5SDimitry Andric #include <utility> 690b57cec5SDimitry Andric 700b57cec5SDimitry Andric using namespace llvm; 710b57cec5SDimitry Andric 720b57cec5SDimitry Andric #define LDIST_NAME "loop-distribute" 730b57cec5SDimitry Andric #define DEBUG_TYPE LDIST_NAME 740b57cec5SDimitry Andric 750b57cec5SDimitry Andric /// @{ 760b57cec5SDimitry Andric /// Metadata attribute names 770b57cec5SDimitry Andric static const char *const LLVMLoopDistributeFollowupAll = 780b57cec5SDimitry Andric "llvm.loop.distribute.followup_all"; 790b57cec5SDimitry Andric static const char *const LLVMLoopDistributeFollowupCoincident = 800b57cec5SDimitry Andric "llvm.loop.distribute.followup_coincident"; 810b57cec5SDimitry Andric static const char *const LLVMLoopDistributeFollowupSequential = 820b57cec5SDimitry Andric "llvm.loop.distribute.followup_sequential"; 830b57cec5SDimitry Andric static const char *const LLVMLoopDistributeFollowupFallback = 840b57cec5SDimitry Andric "llvm.loop.distribute.followup_fallback"; 850b57cec5SDimitry Andric /// @} 860b57cec5SDimitry Andric 870b57cec5SDimitry Andric static cl::opt<bool> 880b57cec5SDimitry Andric LDistVerify("loop-distribute-verify", cl::Hidden, 890b57cec5SDimitry Andric cl::desc("Turn on DominatorTree and LoopInfo verification " 900b57cec5SDimitry Andric "after Loop Distribution"), 910b57cec5SDimitry Andric cl::init(false)); 920b57cec5SDimitry Andric 930b57cec5SDimitry Andric static cl::opt<bool> DistributeNonIfConvertible( 940b57cec5SDimitry Andric "loop-distribute-non-if-convertible", cl::Hidden, 950b57cec5SDimitry Andric cl::desc("Whether to distribute into a loop that may not be " 960b57cec5SDimitry Andric "if-convertible by the loop vectorizer"), 970b57cec5SDimitry Andric cl::init(false)); 980b57cec5SDimitry Andric 990b57cec5SDimitry Andric static cl::opt<unsigned> DistributeSCEVCheckThreshold( 1000b57cec5SDimitry Andric "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden, 1010b57cec5SDimitry Andric cl::desc("The maximum number of SCEV checks allowed for Loop " 1020b57cec5SDimitry Andric "Distribution")); 1030b57cec5SDimitry Andric 1040b57cec5SDimitry Andric static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold( 1050b57cec5SDimitry Andric "loop-distribute-scev-check-threshold-with-pragma", cl::init(128), 1060b57cec5SDimitry Andric cl::Hidden, 1075f757f3fSDimitry Andric cl::desc("The maximum number of SCEV checks allowed for Loop " 1085f757f3fSDimitry Andric "Distribution for loop marked with #pragma clang loop " 1095f757f3fSDimitry Andric "distribute(enable)")); 1100b57cec5SDimitry Andric 1110b57cec5SDimitry Andric static cl::opt<bool> EnableLoopDistribute( 1120b57cec5SDimitry Andric "enable-loop-distribute", cl::Hidden, 1130b57cec5SDimitry Andric cl::desc("Enable the new, experimental LoopDistribution Pass"), 1140b57cec5SDimitry Andric cl::init(false)); 1150b57cec5SDimitry Andric 1160b57cec5SDimitry Andric STATISTIC(NumLoopsDistributed, "Number of loops distributed"); 1170b57cec5SDimitry Andric 1180b57cec5SDimitry Andric namespace { 1190b57cec5SDimitry Andric 1200b57cec5SDimitry Andric /// Maintains the set of instructions of the loop for a partition before 1210b57cec5SDimitry Andric /// cloning. After cloning, it hosts the new loop. 1220b57cec5SDimitry Andric class InstPartition { 123*0fca6ea1SDimitry Andric using InstructionSet = SmallSetVector<Instruction *, 8>; 1240b57cec5SDimitry Andric 1250b57cec5SDimitry Andric public: 1260b57cec5SDimitry Andric InstPartition(Instruction *I, Loop *L, bool DepCycle = false) 1270b57cec5SDimitry Andric : DepCycle(DepCycle), OrigLoop(L) { 1280b57cec5SDimitry Andric Set.insert(I); 1290b57cec5SDimitry Andric } 1300b57cec5SDimitry Andric 1310b57cec5SDimitry Andric /// Returns whether this partition contains a dependence cycle. 1320b57cec5SDimitry Andric bool hasDepCycle() const { return DepCycle; } 1330b57cec5SDimitry Andric 1340b57cec5SDimitry Andric /// Adds an instruction to this partition. 1350b57cec5SDimitry Andric void add(Instruction *I) { Set.insert(I); } 1360b57cec5SDimitry Andric 1370b57cec5SDimitry Andric /// Collection accessors. 1380b57cec5SDimitry Andric InstructionSet::iterator begin() { return Set.begin(); } 1390b57cec5SDimitry Andric InstructionSet::iterator end() { return Set.end(); } 1400b57cec5SDimitry Andric InstructionSet::const_iterator begin() const { return Set.begin(); } 1410b57cec5SDimitry Andric InstructionSet::const_iterator end() const { return Set.end(); } 1420b57cec5SDimitry Andric bool empty() const { return Set.empty(); } 1430b57cec5SDimitry Andric 1440b57cec5SDimitry Andric /// Moves this partition into \p Other. This partition becomes empty 1450b57cec5SDimitry Andric /// after this. 1460b57cec5SDimitry Andric void moveTo(InstPartition &Other) { 1470b57cec5SDimitry Andric Other.Set.insert(Set.begin(), Set.end()); 1480b57cec5SDimitry Andric Set.clear(); 1490b57cec5SDimitry Andric Other.DepCycle |= DepCycle; 1500b57cec5SDimitry Andric } 1510b57cec5SDimitry Andric 1520b57cec5SDimitry Andric /// Populates the partition with a transitive closure of all the 1530b57cec5SDimitry Andric /// instructions that the seeded instructions dependent on. 1540b57cec5SDimitry Andric void populateUsedSet() { 1550b57cec5SDimitry Andric // FIXME: We currently don't use control-dependence but simply include all 1560b57cec5SDimitry Andric // blocks (possibly empty at the end) and let simplifycfg mostly clean this 1570b57cec5SDimitry Andric // up. 1580b57cec5SDimitry Andric for (auto *B : OrigLoop->getBlocks()) 1590b57cec5SDimitry Andric Set.insert(B->getTerminator()); 1600b57cec5SDimitry Andric 1610b57cec5SDimitry Andric // Follow the use-def chains to form a transitive closure of all the 1620b57cec5SDimitry Andric // instructions that the originally seeded instructions depend on. 1630b57cec5SDimitry Andric SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end()); 1640b57cec5SDimitry Andric while (!Worklist.empty()) { 1650b57cec5SDimitry Andric Instruction *I = Worklist.pop_back_val(); 1660b57cec5SDimitry Andric // Insert instructions from the loop that we depend on. 1670b57cec5SDimitry Andric for (Value *V : I->operand_values()) { 1680b57cec5SDimitry Andric auto *I = dyn_cast<Instruction>(V); 169*0fca6ea1SDimitry Andric if (I && OrigLoop->contains(I->getParent()) && Set.insert(I)) 1700b57cec5SDimitry Andric Worklist.push_back(I); 1710b57cec5SDimitry Andric } 1720b57cec5SDimitry Andric } 1730b57cec5SDimitry Andric } 1740b57cec5SDimitry Andric 1750b57cec5SDimitry Andric /// Clones the original loop. 1760b57cec5SDimitry Andric /// 1770b57cec5SDimitry Andric /// Updates LoopInfo and DominatorTree using the information that block \p 1780b57cec5SDimitry Andric /// LoopDomBB dominates the loop. 1790b57cec5SDimitry Andric Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB, 1800b57cec5SDimitry Andric unsigned Index, LoopInfo *LI, 1810b57cec5SDimitry Andric DominatorTree *DT) { 1820b57cec5SDimitry Andric ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop, 1830b57cec5SDimitry Andric VMap, Twine(".ldist") + Twine(Index), 1840b57cec5SDimitry Andric LI, DT, ClonedLoopBlocks); 1850b57cec5SDimitry Andric return ClonedLoop; 1860b57cec5SDimitry Andric } 1870b57cec5SDimitry Andric 1880b57cec5SDimitry Andric /// The cloned loop. If this partition is mapped to the original loop, 1890b57cec5SDimitry Andric /// this is null. 1900b57cec5SDimitry Andric const Loop *getClonedLoop() const { return ClonedLoop; } 1910b57cec5SDimitry Andric 1920b57cec5SDimitry Andric /// Returns the loop where this partition ends up after distribution. 1930b57cec5SDimitry Andric /// If this partition is mapped to the original loop then use the block from 1940b57cec5SDimitry Andric /// the loop. 1950b57cec5SDimitry Andric Loop *getDistributedLoop() const { 1960b57cec5SDimitry Andric return ClonedLoop ? ClonedLoop : OrigLoop; 1970b57cec5SDimitry Andric } 1980b57cec5SDimitry Andric 1990b57cec5SDimitry Andric /// The VMap that is populated by cloning and then used in 2000b57cec5SDimitry Andric /// remapinstruction to remap the cloned instructions. 2010b57cec5SDimitry Andric ValueToValueMapTy &getVMap() { return VMap; } 2020b57cec5SDimitry Andric 2030b57cec5SDimitry Andric /// Remaps the cloned instructions using VMap. 2040b57cec5SDimitry Andric void remapInstructions() { 2050b57cec5SDimitry Andric remapInstructionsInBlocks(ClonedLoopBlocks, VMap); 2060b57cec5SDimitry Andric } 2070b57cec5SDimitry Andric 2080b57cec5SDimitry Andric /// Based on the set of instructions selected for this partition, 2090b57cec5SDimitry Andric /// removes the unnecessary ones. 2100b57cec5SDimitry Andric void removeUnusedInsts() { 2110b57cec5SDimitry Andric SmallVector<Instruction *, 8> Unused; 2120b57cec5SDimitry Andric 2130b57cec5SDimitry Andric for (auto *Block : OrigLoop->getBlocks()) 2140b57cec5SDimitry Andric for (auto &Inst : *Block) 2150b57cec5SDimitry Andric if (!Set.count(&Inst)) { 2160b57cec5SDimitry Andric Instruction *NewInst = &Inst; 2170b57cec5SDimitry Andric if (!VMap.empty()) 2180b57cec5SDimitry Andric NewInst = cast<Instruction>(VMap[NewInst]); 2190b57cec5SDimitry Andric 2200b57cec5SDimitry Andric assert(!isa<BranchInst>(NewInst) && 2210b57cec5SDimitry Andric "Branches are marked used early on"); 2220b57cec5SDimitry Andric Unused.push_back(NewInst); 2230b57cec5SDimitry Andric } 2240b57cec5SDimitry Andric 2250b57cec5SDimitry Andric // Delete the instructions backwards, as it has a reduced likelihood of 2260b57cec5SDimitry Andric // having to update as many def-use and use-def chains. 2270b57cec5SDimitry Andric for (auto *Inst : reverse(Unused)) { 2280b57cec5SDimitry Andric if (!Inst->use_empty()) 22981ad6265SDimitry Andric Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType())); 2300b57cec5SDimitry Andric Inst->eraseFromParent(); 2310b57cec5SDimitry Andric } 2320b57cec5SDimitry Andric } 2330b57cec5SDimitry Andric 234*0fca6ea1SDimitry Andric void print(raw_ostream &OS) const { 235*0fca6ea1SDimitry Andric OS << (DepCycle ? " (cycle)\n" : "\n"); 2360b57cec5SDimitry Andric for (auto *I : Set) 2370b57cec5SDimitry Andric // Prefix with the block name. 238*0fca6ea1SDimitry Andric OS << " " << I->getParent()->getName() << ":" << *I << "\n"; 2390b57cec5SDimitry Andric } 2400b57cec5SDimitry Andric 241*0fca6ea1SDimitry Andric void printBlocks(raw_ostream &OS) const { 2420b57cec5SDimitry Andric for (auto *BB : getDistributedLoop()->getBlocks()) 243*0fca6ea1SDimitry Andric OS << *BB; 2440b57cec5SDimitry Andric } 2450b57cec5SDimitry Andric 2460b57cec5SDimitry Andric private: 2470b57cec5SDimitry Andric /// Instructions from OrigLoop selected for this partition. 2480b57cec5SDimitry Andric InstructionSet Set; 2490b57cec5SDimitry Andric 2500b57cec5SDimitry Andric /// Whether this partition contains a dependence cycle. 2510b57cec5SDimitry Andric bool DepCycle; 2520b57cec5SDimitry Andric 2530b57cec5SDimitry Andric /// The original loop. 2540b57cec5SDimitry Andric Loop *OrigLoop; 2550b57cec5SDimitry Andric 2560b57cec5SDimitry Andric /// The cloned loop. If this partition is mapped to the original loop, 2570b57cec5SDimitry Andric /// this is null. 2580b57cec5SDimitry Andric Loop *ClonedLoop = nullptr; 2590b57cec5SDimitry Andric 2600b57cec5SDimitry Andric /// The blocks of ClonedLoop including the preheader. If this 2610b57cec5SDimitry Andric /// partition is mapped to the original loop, this is empty. 2620b57cec5SDimitry Andric SmallVector<BasicBlock *, 8> ClonedLoopBlocks; 2630b57cec5SDimitry Andric 2640b57cec5SDimitry Andric /// These gets populated once the set of instructions have been 2650b57cec5SDimitry Andric /// finalized. If this partition is mapped to the original loop, these are not 2660b57cec5SDimitry Andric /// set. 2670b57cec5SDimitry Andric ValueToValueMapTy VMap; 2680b57cec5SDimitry Andric }; 2690b57cec5SDimitry Andric 2700b57cec5SDimitry Andric /// Holds the set of Partitions. It populates them, merges them and then 2710b57cec5SDimitry Andric /// clones the loops. 2720b57cec5SDimitry Andric class InstPartitionContainer { 2730b57cec5SDimitry Andric using InstToPartitionIdT = DenseMap<Instruction *, int>; 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric public: 2760b57cec5SDimitry Andric InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT) 2770b57cec5SDimitry Andric : L(L), LI(LI), DT(DT) {} 2780b57cec5SDimitry Andric 2790b57cec5SDimitry Andric /// Returns the number of partitions. 2800b57cec5SDimitry Andric unsigned getSize() const { return PartitionContainer.size(); } 2810b57cec5SDimitry Andric 2820b57cec5SDimitry Andric /// Adds \p Inst into the current partition if that is marked to 2830b57cec5SDimitry Andric /// contain cycles. Otherwise start a new partition for it. 2840b57cec5SDimitry Andric void addToCyclicPartition(Instruction *Inst) { 2850b57cec5SDimitry Andric // If the current partition is non-cyclic. Start a new one. 2860b57cec5SDimitry Andric if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle()) 2870b57cec5SDimitry Andric PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true); 2880b57cec5SDimitry Andric else 2890b57cec5SDimitry Andric PartitionContainer.back().add(Inst); 2900b57cec5SDimitry Andric } 2910b57cec5SDimitry Andric 2920b57cec5SDimitry Andric /// Adds \p Inst into a partition that is not marked to contain 2930b57cec5SDimitry Andric /// dependence cycles. 2940b57cec5SDimitry Andric /// 2950b57cec5SDimitry Andric // Initially we isolate memory instructions into as many partitions as 2960b57cec5SDimitry Andric // possible, then later we may merge them back together. 2970b57cec5SDimitry Andric void addToNewNonCyclicPartition(Instruction *Inst) { 2980b57cec5SDimitry Andric PartitionContainer.emplace_back(Inst, L); 2990b57cec5SDimitry Andric } 3000b57cec5SDimitry Andric 3010b57cec5SDimitry Andric /// Merges adjacent non-cyclic partitions. 3020b57cec5SDimitry Andric /// 3030b57cec5SDimitry Andric /// The idea is that we currently only want to isolate the non-vectorizable 3040b57cec5SDimitry Andric /// partition. We could later allow more distribution among these partition 3050b57cec5SDimitry Andric /// too. 3060b57cec5SDimitry Andric void mergeAdjacentNonCyclic() { 3070b57cec5SDimitry Andric mergeAdjacentPartitionsIf( 3080b57cec5SDimitry Andric [](const InstPartition *P) { return !P->hasDepCycle(); }); 3090b57cec5SDimitry Andric } 3100b57cec5SDimitry Andric 3110b57cec5SDimitry Andric /// If a partition contains only conditional stores, we won't vectorize 3120b57cec5SDimitry Andric /// it. Try to merge it with a previous cyclic partition. 3130b57cec5SDimitry Andric void mergeNonIfConvertible() { 3140b57cec5SDimitry Andric mergeAdjacentPartitionsIf([&](const InstPartition *Partition) { 3150b57cec5SDimitry Andric if (Partition->hasDepCycle()) 3160b57cec5SDimitry Andric return true; 3170b57cec5SDimitry Andric 3180b57cec5SDimitry Andric // Now, check if all stores are conditional in this partition. 3190b57cec5SDimitry Andric bool seenStore = false; 3200b57cec5SDimitry Andric 3210b57cec5SDimitry Andric for (auto *Inst : *Partition) 3220b57cec5SDimitry Andric if (isa<StoreInst>(Inst)) { 3230b57cec5SDimitry Andric seenStore = true; 3240b57cec5SDimitry Andric if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT)) 3250b57cec5SDimitry Andric return false; 3260b57cec5SDimitry Andric } 3270b57cec5SDimitry Andric return seenStore; 3280b57cec5SDimitry Andric }); 3290b57cec5SDimitry Andric } 3300b57cec5SDimitry Andric 3310b57cec5SDimitry Andric /// Merges the partitions according to various heuristics. 3320b57cec5SDimitry Andric void mergeBeforePopulating() { 3330b57cec5SDimitry Andric mergeAdjacentNonCyclic(); 3340b57cec5SDimitry Andric if (!DistributeNonIfConvertible) 3350b57cec5SDimitry Andric mergeNonIfConvertible(); 3360b57cec5SDimitry Andric } 3370b57cec5SDimitry Andric 3380b57cec5SDimitry Andric /// Merges partitions in order to ensure that no loads are duplicated. 3390b57cec5SDimitry Andric /// 3400b57cec5SDimitry Andric /// We can't duplicate loads because that could potentially reorder them. 3410b57cec5SDimitry Andric /// LoopAccessAnalysis provides dependency information with the context that 3420b57cec5SDimitry Andric /// the order of memory operation is preserved. 3430b57cec5SDimitry Andric /// 3440b57cec5SDimitry Andric /// Return if any partitions were merged. 3450b57cec5SDimitry Andric bool mergeToAvoidDuplicatedLoads() { 3460b57cec5SDimitry Andric using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>; 3470b57cec5SDimitry Andric using ToBeMergedT = EquivalenceClasses<InstPartition *>; 3480b57cec5SDimitry Andric 3490b57cec5SDimitry Andric LoadToPartitionT LoadToPartition; 3500b57cec5SDimitry Andric ToBeMergedT ToBeMerged; 3510b57cec5SDimitry Andric 3520b57cec5SDimitry Andric // Step through the partitions and create equivalence between partitions 3530b57cec5SDimitry Andric // that contain the same load. Also put partitions in between them in the 3540b57cec5SDimitry Andric // same equivalence class to avoid reordering of memory operations. 3550b57cec5SDimitry Andric for (PartitionContainerT::iterator I = PartitionContainer.begin(), 3560b57cec5SDimitry Andric E = PartitionContainer.end(); 3570b57cec5SDimitry Andric I != E; ++I) { 3580b57cec5SDimitry Andric auto *PartI = &*I; 3590b57cec5SDimitry Andric 3600b57cec5SDimitry Andric // If a load occurs in two partitions PartI and PartJ, merge all 3610b57cec5SDimitry Andric // partitions (PartI, PartJ] into PartI. 3620b57cec5SDimitry Andric for (Instruction *Inst : *PartI) 3630b57cec5SDimitry Andric if (isa<LoadInst>(Inst)) { 3640b57cec5SDimitry Andric bool NewElt; 3650b57cec5SDimitry Andric LoadToPartitionT::iterator LoadToPart; 3660b57cec5SDimitry Andric 3670b57cec5SDimitry Andric std::tie(LoadToPart, NewElt) = 3680b57cec5SDimitry Andric LoadToPartition.insert(std::make_pair(Inst, PartI)); 3690b57cec5SDimitry Andric if (!NewElt) { 370*0fca6ea1SDimitry Andric LLVM_DEBUG( 371*0fca6ea1SDimitry Andric dbgs() 372*0fca6ea1SDimitry Andric << "LDist: Merging partitions due to this load in multiple " 373*0fca6ea1SDimitry Andric << "partitions: " << PartI << ", " << LoadToPart->second << "\n" 3740b57cec5SDimitry Andric << *Inst << "\n"); 3750b57cec5SDimitry Andric 3760b57cec5SDimitry Andric auto PartJ = I; 3770b57cec5SDimitry Andric do { 3780b57cec5SDimitry Andric --PartJ; 3790b57cec5SDimitry Andric ToBeMerged.unionSets(PartI, &*PartJ); 3800b57cec5SDimitry Andric } while (&*PartJ != LoadToPart->second); 3810b57cec5SDimitry Andric } 3820b57cec5SDimitry Andric } 3830b57cec5SDimitry Andric } 3840b57cec5SDimitry Andric if (ToBeMerged.empty()) 3850b57cec5SDimitry Andric return false; 3860b57cec5SDimitry Andric 3870b57cec5SDimitry Andric // Merge the member of an equivalence class into its class leader. This 3880b57cec5SDimitry Andric // makes the members empty. 3890b57cec5SDimitry Andric for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end(); 3900b57cec5SDimitry Andric I != E; ++I) { 3910b57cec5SDimitry Andric if (!I->isLeader()) 3920b57cec5SDimitry Andric continue; 3930b57cec5SDimitry Andric 3940b57cec5SDimitry Andric auto PartI = I->getData(); 395bdd1243dSDimitry Andric for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)), 3960b57cec5SDimitry Andric ToBeMerged.member_end())) { 3970b57cec5SDimitry Andric PartJ->moveTo(*PartI); 3980b57cec5SDimitry Andric } 3990b57cec5SDimitry Andric } 4000b57cec5SDimitry Andric 4010b57cec5SDimitry Andric // Remove the empty partitions. 4020b57cec5SDimitry Andric PartitionContainer.remove_if( 4030b57cec5SDimitry Andric [](const InstPartition &P) { return P.empty(); }); 4040b57cec5SDimitry Andric 4050b57cec5SDimitry Andric return true; 4060b57cec5SDimitry Andric } 4070b57cec5SDimitry Andric 4080b57cec5SDimitry Andric /// Sets up the mapping between instructions to partitions. If the 4090b57cec5SDimitry Andric /// instruction is duplicated across multiple partitions, set the entry to -1. 4100b57cec5SDimitry Andric void setupPartitionIdOnInstructions() { 4110b57cec5SDimitry Andric int PartitionID = 0; 4120b57cec5SDimitry Andric for (const auto &Partition : PartitionContainer) { 4130b57cec5SDimitry Andric for (Instruction *Inst : Partition) { 4140b57cec5SDimitry Andric bool NewElt; 4150b57cec5SDimitry Andric InstToPartitionIdT::iterator Iter; 4160b57cec5SDimitry Andric 4170b57cec5SDimitry Andric std::tie(Iter, NewElt) = 4180b57cec5SDimitry Andric InstToPartitionId.insert(std::make_pair(Inst, PartitionID)); 4190b57cec5SDimitry Andric if (!NewElt) 4200b57cec5SDimitry Andric Iter->second = -1; 4210b57cec5SDimitry Andric } 4220b57cec5SDimitry Andric ++PartitionID; 4230b57cec5SDimitry Andric } 4240b57cec5SDimitry Andric } 4250b57cec5SDimitry Andric 4260b57cec5SDimitry Andric /// Populates the partition with everything that the seeding 4270b57cec5SDimitry Andric /// instructions require. 4280b57cec5SDimitry Andric void populateUsedSet() { 4290b57cec5SDimitry Andric for (auto &P : PartitionContainer) 4300b57cec5SDimitry Andric P.populateUsedSet(); 4310b57cec5SDimitry Andric } 4320b57cec5SDimitry Andric 4330b57cec5SDimitry Andric /// This performs the main chunk of the work of cloning the loops for 4340b57cec5SDimitry Andric /// the partitions. 4350b57cec5SDimitry Andric void cloneLoops() { 4360b57cec5SDimitry Andric BasicBlock *OrigPH = L->getLoopPreheader(); 4370b57cec5SDimitry Andric // At this point the predecessor of the preheader is either the memcheck 4380b57cec5SDimitry Andric // block or the top part of the original preheader. 4390b57cec5SDimitry Andric BasicBlock *Pred = OrigPH->getSinglePredecessor(); 4400b57cec5SDimitry Andric assert(Pred && "Preheader does not have a single predecessor"); 4410b57cec5SDimitry Andric BasicBlock *ExitBlock = L->getExitBlock(); 4420b57cec5SDimitry Andric assert(ExitBlock && "No single exit block"); 4430b57cec5SDimitry Andric Loop *NewLoop; 4440b57cec5SDimitry Andric 4450b57cec5SDimitry Andric assert(!PartitionContainer.empty() && "at least two partitions expected"); 4460b57cec5SDimitry Andric // We're cloning the preheader along with the loop so we already made sure 4470b57cec5SDimitry Andric // it was empty. 4480b57cec5SDimitry Andric assert(&*OrigPH->begin() == OrigPH->getTerminator() && 4490b57cec5SDimitry Andric "preheader not empty"); 4500b57cec5SDimitry Andric 4510b57cec5SDimitry Andric // Preserve the original loop ID for use after the transformation. 4520b57cec5SDimitry Andric MDNode *OrigLoopID = L->getLoopID(); 4530b57cec5SDimitry Andric 4540b57cec5SDimitry Andric // Create a loop for each partition except the last. Clone the original 4550b57cec5SDimitry Andric // loop before PH along with adding a preheader for the cloned loop. Then 4560b57cec5SDimitry Andric // update PH to point to the newly added preheader. 4570b57cec5SDimitry Andric BasicBlock *TopPH = OrigPH; 4580b57cec5SDimitry Andric unsigned Index = getSize() - 1; 459bdd1243dSDimitry Andric for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) { 460bdd1243dSDimitry Andric NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT); 4610b57cec5SDimitry Andric 462bdd1243dSDimitry Andric Part.getVMap()[ExitBlock] = TopPH; 463bdd1243dSDimitry Andric Part.remapInstructions(); 464bdd1243dSDimitry Andric setNewLoopID(OrigLoopID, &Part); 465bdd1243dSDimitry Andric --Index; 466bdd1243dSDimitry Andric TopPH = NewLoop->getLoopPreheader(); 4670b57cec5SDimitry Andric } 4680b57cec5SDimitry Andric Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH); 4690b57cec5SDimitry Andric 4700b57cec5SDimitry Andric // Also set a new loop ID for the last loop. 4710b57cec5SDimitry Andric setNewLoopID(OrigLoopID, &PartitionContainer.back()); 4720b57cec5SDimitry Andric 4730b57cec5SDimitry Andric // Now go in forward order and update the immediate dominator for the 4740b57cec5SDimitry Andric // preheaders with the exiting block of the previous loop. Dominance 4750b57cec5SDimitry Andric // within the loop is updated in cloneLoopWithPreheader. 4760b57cec5SDimitry Andric for (auto Curr = PartitionContainer.cbegin(), 4770b57cec5SDimitry Andric Next = std::next(PartitionContainer.cbegin()), 4780b57cec5SDimitry Andric E = PartitionContainer.cend(); 4790b57cec5SDimitry Andric Next != E; ++Curr, ++Next) 4800b57cec5SDimitry Andric DT->changeImmediateDominator( 4810b57cec5SDimitry Andric Next->getDistributedLoop()->getLoopPreheader(), 4820b57cec5SDimitry Andric Curr->getDistributedLoop()->getExitingBlock()); 4830b57cec5SDimitry Andric } 4840b57cec5SDimitry Andric 4850b57cec5SDimitry Andric /// Removes the dead instructions from the cloned loops. 4860b57cec5SDimitry Andric void removeUnusedInsts() { 4870b57cec5SDimitry Andric for (auto &Partition : PartitionContainer) 4880b57cec5SDimitry Andric Partition.removeUnusedInsts(); 4890b57cec5SDimitry Andric } 4900b57cec5SDimitry Andric 4910b57cec5SDimitry Andric /// For each memory pointer, it computes the partitionId the pointer is 4920b57cec5SDimitry Andric /// used in. 4930b57cec5SDimitry Andric /// 4940b57cec5SDimitry Andric /// This returns an array of int where the I-th entry corresponds to I-th 4950b57cec5SDimitry Andric /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple 4960b57cec5SDimitry Andric /// partitions its entry is set to -1. 4970b57cec5SDimitry Andric SmallVector<int, 8> 4980b57cec5SDimitry Andric computePartitionSetForPointers(const LoopAccessInfo &LAI) { 4990b57cec5SDimitry Andric const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking(); 5000b57cec5SDimitry Andric 5010b57cec5SDimitry Andric unsigned N = RtPtrCheck->Pointers.size(); 5020b57cec5SDimitry Andric SmallVector<int, 8> PtrToPartitions(N); 5030b57cec5SDimitry Andric for (unsigned I = 0; I < N; ++I) { 5040b57cec5SDimitry Andric Value *Ptr = RtPtrCheck->Pointers[I].PointerValue; 5050b57cec5SDimitry Andric auto Instructions = 5060b57cec5SDimitry Andric LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr); 5070b57cec5SDimitry Andric 5080b57cec5SDimitry Andric int &Partition = PtrToPartitions[I]; 5090b57cec5SDimitry Andric // First set it to uninitialized. 5100b57cec5SDimitry Andric Partition = -2; 5110b57cec5SDimitry Andric for (Instruction *Inst : Instructions) { 5120b57cec5SDimitry Andric // Note that this could be -1 if Inst is duplicated across multiple 5130b57cec5SDimitry Andric // partitions. 5140b57cec5SDimitry Andric int ThisPartition = this->InstToPartitionId[Inst]; 5150b57cec5SDimitry Andric if (Partition == -2) 5160b57cec5SDimitry Andric Partition = ThisPartition; 5170b57cec5SDimitry Andric // -1 means belonging to multiple partitions. 5180b57cec5SDimitry Andric else if (Partition == -1) 5190b57cec5SDimitry Andric break; 5200b57cec5SDimitry Andric else if (Partition != (int)ThisPartition) 5210b57cec5SDimitry Andric Partition = -1; 5220b57cec5SDimitry Andric } 5230b57cec5SDimitry Andric assert(Partition != -2 && "Pointer not belonging to any partition"); 5240b57cec5SDimitry Andric } 5250b57cec5SDimitry Andric 5260b57cec5SDimitry Andric return PtrToPartitions; 5270b57cec5SDimitry Andric } 5280b57cec5SDimitry Andric 5290b57cec5SDimitry Andric void print(raw_ostream &OS) const { 5300b57cec5SDimitry Andric unsigned Index = 0; 5310b57cec5SDimitry Andric for (const auto &P : PartitionContainer) { 532*0fca6ea1SDimitry Andric OS << "LDist: Partition " << Index++ << ":"; 533*0fca6ea1SDimitry Andric P.print(OS); 5340b57cec5SDimitry Andric } 5350b57cec5SDimitry Andric } 5360b57cec5SDimitry Andric 5370b57cec5SDimitry Andric void dump() const { print(dbgs()); } 5380b57cec5SDimitry Andric 5390b57cec5SDimitry Andric #ifndef NDEBUG 5400b57cec5SDimitry Andric friend raw_ostream &operator<<(raw_ostream &OS, 5410b57cec5SDimitry Andric const InstPartitionContainer &Partitions) { 5420b57cec5SDimitry Andric Partitions.print(OS); 5430b57cec5SDimitry Andric return OS; 5440b57cec5SDimitry Andric } 5450b57cec5SDimitry Andric #endif 5460b57cec5SDimitry Andric 547*0fca6ea1SDimitry Andric void printBlocks(raw_ostream &OS) const { 5480b57cec5SDimitry Andric unsigned Index = 0; 5490b57cec5SDimitry Andric for (const auto &P : PartitionContainer) { 550*0fca6ea1SDimitry Andric OS << "LDist: Partition " << Index++ << ":"; 551*0fca6ea1SDimitry Andric P.printBlocks(OS); 5520b57cec5SDimitry Andric } 5530b57cec5SDimitry Andric } 5540b57cec5SDimitry Andric 5550b57cec5SDimitry Andric private: 5560b57cec5SDimitry Andric using PartitionContainerT = std::list<InstPartition>; 5570b57cec5SDimitry Andric 5580b57cec5SDimitry Andric /// List of partitions. 5590b57cec5SDimitry Andric PartitionContainerT PartitionContainer; 5600b57cec5SDimitry Andric 5610b57cec5SDimitry Andric /// Mapping from Instruction to partition Id. If the instruction 5620b57cec5SDimitry Andric /// belongs to multiple partitions the entry contains -1. 5630b57cec5SDimitry Andric InstToPartitionIdT InstToPartitionId; 5640b57cec5SDimitry Andric 5650b57cec5SDimitry Andric Loop *L; 5660b57cec5SDimitry Andric LoopInfo *LI; 5670b57cec5SDimitry Andric DominatorTree *DT; 5680b57cec5SDimitry Andric 5690b57cec5SDimitry Andric /// The control structure to merge adjacent partitions if both satisfy 5700b57cec5SDimitry Andric /// the \p Predicate. 5710b57cec5SDimitry Andric template <class UnaryPredicate> 5720b57cec5SDimitry Andric void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) { 5730b57cec5SDimitry Andric InstPartition *PrevMatch = nullptr; 5740b57cec5SDimitry Andric for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) { 5750b57cec5SDimitry Andric auto DoesMatch = Predicate(&*I); 5760b57cec5SDimitry Andric if (PrevMatch == nullptr && DoesMatch) { 5770b57cec5SDimitry Andric PrevMatch = &*I; 5780b57cec5SDimitry Andric ++I; 5790b57cec5SDimitry Andric } else if (PrevMatch != nullptr && DoesMatch) { 5800b57cec5SDimitry Andric I->moveTo(*PrevMatch); 5810b57cec5SDimitry Andric I = PartitionContainer.erase(I); 5820b57cec5SDimitry Andric } else { 5830b57cec5SDimitry Andric PrevMatch = nullptr; 5840b57cec5SDimitry Andric ++I; 5850b57cec5SDimitry Andric } 5860b57cec5SDimitry Andric } 5870b57cec5SDimitry Andric } 5880b57cec5SDimitry Andric 5890b57cec5SDimitry Andric /// Assign new LoopIDs for the partition's cloned loop. 5900b57cec5SDimitry Andric void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) { 591bdd1243dSDimitry Andric std::optional<MDNode *> PartitionID = makeFollowupLoopID( 5920b57cec5SDimitry Andric OrigLoopID, 5930b57cec5SDimitry Andric {LLVMLoopDistributeFollowupAll, 5940b57cec5SDimitry Andric Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential 5950b57cec5SDimitry Andric : LLVMLoopDistributeFollowupCoincident}); 59681ad6265SDimitry Andric if (PartitionID) { 5970b57cec5SDimitry Andric Loop *NewLoop = Part->getDistributedLoop(); 598bdd1243dSDimitry Andric NewLoop->setLoopID(*PartitionID); 5990b57cec5SDimitry Andric } 6000b57cec5SDimitry Andric } 6010b57cec5SDimitry Andric }; 6020b57cec5SDimitry Andric 6030b57cec5SDimitry Andric /// For each memory instruction, this class maintains difference of the 6040b57cec5SDimitry Andric /// number of unsafe dependences that start out from this instruction minus 6050b57cec5SDimitry Andric /// those that end here. 6060b57cec5SDimitry Andric /// 6070b57cec5SDimitry Andric /// By traversing the memory instructions in program order and accumulating this 6080b57cec5SDimitry Andric /// number, we know whether any unsafe dependence crosses over a program point. 6090b57cec5SDimitry Andric class MemoryInstructionDependences { 6100b57cec5SDimitry Andric using Dependence = MemoryDepChecker::Dependence; 6110b57cec5SDimitry Andric 6120b57cec5SDimitry Andric public: 6130b57cec5SDimitry Andric struct Entry { 6140b57cec5SDimitry Andric Instruction *Inst; 6150b57cec5SDimitry Andric unsigned NumUnsafeDependencesStartOrEnd = 0; 6160b57cec5SDimitry Andric 6170b57cec5SDimitry Andric Entry(Instruction *Inst) : Inst(Inst) {} 6180b57cec5SDimitry Andric }; 6190b57cec5SDimitry Andric 6200b57cec5SDimitry Andric using AccessesType = SmallVector<Entry, 8>; 6210b57cec5SDimitry Andric 6220b57cec5SDimitry Andric AccessesType::const_iterator begin() const { return Accesses.begin(); } 6230b57cec5SDimitry Andric AccessesType::const_iterator end() const { return Accesses.end(); } 6240b57cec5SDimitry Andric 6250b57cec5SDimitry Andric MemoryInstructionDependences( 6260b57cec5SDimitry Andric const SmallVectorImpl<Instruction *> &Instructions, 6270b57cec5SDimitry Andric const SmallVectorImpl<Dependence> &Dependences) { 6280b57cec5SDimitry Andric Accesses.append(Instructions.begin(), Instructions.end()); 6290b57cec5SDimitry Andric 630*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Backward dependences:\n"); 631bdd1243dSDimitry Andric for (const auto &Dep : Dependences) 6320b57cec5SDimitry Andric if (Dep.isPossiblyBackward()) { 6330b57cec5SDimitry Andric // Note that the designations source and destination follow the program 6340b57cec5SDimitry Andric // order, i.e. source is always first. (The direction is given by the 6350b57cec5SDimitry Andric // DepType.) 6360b57cec5SDimitry Andric ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd; 6370b57cec5SDimitry Andric --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd; 6380b57cec5SDimitry Andric 6390b57cec5SDimitry Andric LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions)); 6400b57cec5SDimitry Andric } 6410b57cec5SDimitry Andric } 6420b57cec5SDimitry Andric 6430b57cec5SDimitry Andric private: 6440b57cec5SDimitry Andric AccessesType Accesses; 6450b57cec5SDimitry Andric }; 6460b57cec5SDimitry Andric 6470b57cec5SDimitry Andric /// The actual class performing the per-loop work. 6480b57cec5SDimitry Andric class LoopDistributeForLoop { 6490b57cec5SDimitry Andric public: 6500b57cec5SDimitry Andric LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT, 651bdd1243dSDimitry Andric ScalarEvolution *SE, LoopAccessInfoManager &LAIs, 652bdd1243dSDimitry Andric OptimizationRemarkEmitter *ORE) 653bdd1243dSDimitry Andric : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) { 6540b57cec5SDimitry Andric setForced(); 6550b57cec5SDimitry Andric } 6560b57cec5SDimitry Andric 6570b57cec5SDimitry Andric /// Try to distribute an inner-most loop. 658bdd1243dSDimitry Andric bool processLoop() { 659e8d8bef9SDimitry Andric assert(L->isInnermost() && "Only process inner loops."); 6600b57cec5SDimitry Andric 661*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "\nLDist: Checking a loop in '" 662*0fca6ea1SDimitry Andric << L->getHeader()->getParent()->getName() << "' from " 663*0fca6ea1SDimitry Andric << L->getLocStr() << "\n"); 6640b57cec5SDimitry Andric 665e8d8bef9SDimitry Andric // Having a single exit block implies there's also one exiting block. 6660b57cec5SDimitry Andric if (!L->getExitBlock()) 6670b57cec5SDimitry Andric return fail("MultipleExitBlocks", "multiple exit blocks"); 6680b57cec5SDimitry Andric if (!L->isLoopSimplifyForm()) 6690b57cec5SDimitry Andric return fail("NotLoopSimplifyForm", 6700b57cec5SDimitry Andric "loop is not in loop-simplify form"); 671e8d8bef9SDimitry Andric if (!L->isRotatedForm()) 672e8d8bef9SDimitry Andric return fail("NotBottomTested", "loop is not bottom tested"); 6730b57cec5SDimitry Andric 6740b57cec5SDimitry Andric BasicBlock *PH = L->getLoopPreheader(); 6750b57cec5SDimitry Andric 676bdd1243dSDimitry Andric LAI = &LAIs.getInfo(*L); 6770b57cec5SDimitry Andric 6780b57cec5SDimitry Andric // Currently, we only distribute to isolate the part of the loop with 6790b57cec5SDimitry Andric // dependence cycles to enable partial vectorization. 6800b57cec5SDimitry Andric if (LAI->canVectorizeMemory()) 6810b57cec5SDimitry Andric return fail("MemOpsCanBeVectorized", 6820b57cec5SDimitry Andric "memory operations are safe for vectorization"); 6830b57cec5SDimitry Andric 6840b57cec5SDimitry Andric auto *Dependences = LAI->getDepChecker().getDependences(); 6850b57cec5SDimitry Andric if (!Dependences || Dependences->empty()) 6860b57cec5SDimitry Andric return fail("NoUnsafeDeps", "no unsafe dependences to isolate"); 6870b57cec5SDimitry Andric 688*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: " 689*0fca6ea1SDimitry Andric << L->getHeader()->getName() << "\n"); 690*0fca6ea1SDimitry Andric 6910b57cec5SDimitry Andric InstPartitionContainer Partitions(L, LI, DT); 6920b57cec5SDimitry Andric 6930b57cec5SDimitry Andric // First, go through each memory operation and assign them to consecutive 6940b57cec5SDimitry Andric // partitions (the order of partitions follows program order). Put those 6950b57cec5SDimitry Andric // with unsafe dependences into "cyclic" partition otherwise put each store 6960b57cec5SDimitry Andric // in its own "non-cyclic" partition (we'll merge these later). 6970b57cec5SDimitry Andric // 6980b57cec5SDimitry Andric // Note that a memory operation (e.g. Load2 below) at a program point that 6990b57cec5SDimitry Andric // has an unsafe dependence (Store3->Load1) spanning over it must be 7000b57cec5SDimitry Andric // included in the same cyclic partition as the dependent operations. This 7010b57cec5SDimitry Andric // is to preserve the original program order after distribution. E.g.: 7020b57cec5SDimitry Andric // 7030b57cec5SDimitry Andric // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive 7040b57cec5SDimitry Andric // Load1 -. 1 0->1 7050b57cec5SDimitry Andric // Load2 | /Unsafe/ 0 1 7060b57cec5SDimitry Andric // Store3 -' -1 1->0 7070b57cec5SDimitry Andric // Load4 0 0 7080b57cec5SDimitry Andric // 7090b57cec5SDimitry Andric // NumUnsafeDependencesActive > 0 indicates this situation and in this case 7100b57cec5SDimitry Andric // we just keep assigning to the same cyclic partition until 7110b57cec5SDimitry Andric // NumUnsafeDependencesActive reaches 0. 7120b57cec5SDimitry Andric const MemoryDepChecker &DepChecker = LAI->getDepChecker(); 7130b57cec5SDimitry Andric MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(), 7140b57cec5SDimitry Andric *Dependences); 7150b57cec5SDimitry Andric 7160b57cec5SDimitry Andric int NumUnsafeDependencesActive = 0; 717bdd1243dSDimitry Andric for (const auto &InstDep : MID) { 7180b57cec5SDimitry Andric Instruction *I = InstDep.Inst; 7190b57cec5SDimitry Andric // We update NumUnsafeDependencesActive post-instruction, catch the 7200b57cec5SDimitry Andric // start of a dependence directly via NumUnsafeDependencesStartOrEnd. 7210b57cec5SDimitry Andric if (NumUnsafeDependencesActive || 7220b57cec5SDimitry Andric InstDep.NumUnsafeDependencesStartOrEnd > 0) 7230b57cec5SDimitry Andric Partitions.addToCyclicPartition(I); 7240b57cec5SDimitry Andric else 7250b57cec5SDimitry Andric Partitions.addToNewNonCyclicPartition(I); 7260b57cec5SDimitry Andric NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd; 7270b57cec5SDimitry Andric assert(NumUnsafeDependencesActive >= 0 && 7280b57cec5SDimitry Andric "Negative number of dependences active"); 7290b57cec5SDimitry Andric } 7300b57cec5SDimitry Andric 7310b57cec5SDimitry Andric // Add partitions for values used outside. These partitions can be out of 7320b57cec5SDimitry Andric // order from the original program order. This is OK because if the 7330b57cec5SDimitry Andric // partition uses a load we will merge this partition with the original 7340b57cec5SDimitry Andric // partition of the load that we set up in the previous loop (see 7350b57cec5SDimitry Andric // mergeToAvoidDuplicatedLoads). 7360b57cec5SDimitry Andric auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L); 7370b57cec5SDimitry Andric for (auto *Inst : DefsUsedOutside) 7380b57cec5SDimitry Andric Partitions.addToNewNonCyclicPartition(Inst); 7390b57cec5SDimitry Andric 740*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Seeded partitions:\n" << Partitions); 7410b57cec5SDimitry Andric if (Partitions.getSize() < 2) 7420b57cec5SDimitry Andric return fail("CantIsolateUnsafeDeps", 7430b57cec5SDimitry Andric "cannot isolate unsafe dependencies"); 7440b57cec5SDimitry Andric 7450b57cec5SDimitry Andric // Run the merge heuristics: Merge non-cyclic adjacent partitions since we 7460b57cec5SDimitry Andric // should be able to vectorize these together. 7470b57cec5SDimitry Andric Partitions.mergeBeforePopulating(); 748*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Merged partitions:\n" << Partitions); 7490b57cec5SDimitry Andric if (Partitions.getSize() < 2) 7500b57cec5SDimitry Andric return fail("CantIsolateUnsafeDeps", 7510b57cec5SDimitry Andric "cannot isolate unsafe dependencies"); 7520b57cec5SDimitry Andric 7530b57cec5SDimitry Andric // Now, populate the partitions with non-memory operations. 7540b57cec5SDimitry Andric Partitions.populateUsedSet(); 755*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Populated partitions:\n" << Partitions); 7560b57cec5SDimitry Andric 7570b57cec5SDimitry Andric // In order to preserve original lexical order for loads, keep them in the 7580b57cec5SDimitry Andric // partition that we set up in the MemoryInstructionDependences loop. 7590b57cec5SDimitry Andric if (Partitions.mergeToAvoidDuplicatedLoads()) { 760*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Partitions merged to ensure unique loads:\n" 7610b57cec5SDimitry Andric << Partitions); 7620b57cec5SDimitry Andric if (Partitions.getSize() < 2) 7630b57cec5SDimitry Andric return fail("CantIsolateUnsafeDeps", 7640b57cec5SDimitry Andric "cannot isolate unsafe dependencies"); 7650b57cec5SDimitry Andric } 7660b57cec5SDimitry Andric 7670b57cec5SDimitry Andric // Don't distribute the loop if we need too many SCEV run-time checks, or 7680b57cec5SDimitry Andric // any if it's illegal. 76981ad6265SDimitry Andric const SCEVPredicate &Pred = LAI->getPSE().getPredicate(); 7700b57cec5SDimitry Andric if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) { 7710b57cec5SDimitry Andric return fail("RuntimeCheckWithConvergent", 7720b57cec5SDimitry Andric "may not insert runtime check with convergent operation"); 7730b57cec5SDimitry Andric } 7740b57cec5SDimitry Andric 77581ad6265SDimitry Andric if (Pred.getComplexity() > (IsForced.value_or(false) 7760b57cec5SDimitry Andric ? PragmaDistributeSCEVCheckThreshold 7770b57cec5SDimitry Andric : DistributeSCEVCheckThreshold)) 7780b57cec5SDimitry Andric return fail("TooManySCEVRuntimeChecks", 7790b57cec5SDimitry Andric "too many SCEV run-time checks needed.\n"); 7800b57cec5SDimitry Andric 78181ad6265SDimitry Andric if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L)) 7820b57cec5SDimitry Andric return fail("HeuristicDisabled", "distribution heuristic disabled"); 7830b57cec5SDimitry Andric 784*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Distributing loop: " 785*0fca6ea1SDimitry Andric << L->getHeader()->getName() << "\n"); 7860b57cec5SDimitry Andric // We're done forming the partitions set up the reverse mapping from 7870b57cec5SDimitry Andric // instructions to partitions. 7880b57cec5SDimitry Andric Partitions.setupPartitionIdOnInstructions(); 7890b57cec5SDimitry Andric 7900b57cec5SDimitry Andric // If we need run-time checks, version the loop now. 7910b57cec5SDimitry Andric auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI); 7920b57cec5SDimitry Andric const auto *RtPtrChecking = LAI->getRuntimePointerChecking(); 7930b57cec5SDimitry Andric const auto &AllChecks = RtPtrChecking->getChecks(); 7940b57cec5SDimitry Andric auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition, 7950b57cec5SDimitry Andric RtPtrChecking); 7960b57cec5SDimitry Andric 7970b57cec5SDimitry Andric if (LAI->hasConvergentOp() && !Checks.empty()) { 7980b57cec5SDimitry Andric return fail("RuntimeCheckWithConvergent", 7990b57cec5SDimitry Andric "may not insert runtime check with convergent operation"); 8000b57cec5SDimitry Andric } 8010b57cec5SDimitry Andric 8025ffd83dbSDimitry Andric // To keep things simple have an empty preheader before we version or clone 8035ffd83dbSDimitry Andric // the loop. (Also split if this has no predecessor, i.e. entry, because we 8045ffd83dbSDimitry Andric // rely on PH having a predecessor.) 8055ffd83dbSDimitry Andric if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator()) 8065ffd83dbSDimitry Andric SplitBlock(PH, PH->getTerminator(), DT, LI); 8075ffd83dbSDimitry Andric 8080b57cec5SDimitry Andric if (!Pred.isAlwaysTrue() || !Checks.empty()) { 8090b57cec5SDimitry Andric assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning"); 8100b57cec5SDimitry Andric 8110b57cec5SDimitry Andric MDNode *OrigLoopID = L->getLoopID(); 8120b57cec5SDimitry Andric 813*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Pointers:\n"); 8140b57cec5SDimitry Andric LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks)); 815e8d8bef9SDimitry Andric LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE); 8160b57cec5SDimitry Andric LVer.versionLoop(DefsUsedOutside); 8170b57cec5SDimitry Andric LVer.annotateLoopWithNoAlias(); 8180b57cec5SDimitry Andric 8190b57cec5SDimitry Andric // The unversioned loop will not be changed, so we inherit all attributes 8200b57cec5SDimitry Andric // from the original loop, but remove the loop distribution metadata to 8210b57cec5SDimitry Andric // avoid to distribute it again. 822bdd1243dSDimitry Andric MDNode *UnversionedLoopID = *makeFollowupLoopID( 823bdd1243dSDimitry Andric OrigLoopID, 824bdd1243dSDimitry Andric {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback}, 825bdd1243dSDimitry Andric "llvm.loop.distribute.", true); 8260b57cec5SDimitry Andric LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID); 8270b57cec5SDimitry Andric } 8280b57cec5SDimitry Andric 8290b57cec5SDimitry Andric // Create identical copies of the original loop for each partition and hook 8300b57cec5SDimitry Andric // them up sequentially. 8310b57cec5SDimitry Andric Partitions.cloneLoops(); 8320b57cec5SDimitry Andric 8330b57cec5SDimitry Andric // Now, we remove the instruction from each loop that don't belong to that 8340b57cec5SDimitry Andric // partition. 8350b57cec5SDimitry Andric Partitions.removeUnusedInsts(); 836*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: After removing unused Instrs:\n"); 837*0fca6ea1SDimitry Andric LLVM_DEBUG(Partitions.printBlocks(dbgs())); 8380b57cec5SDimitry Andric 8390b57cec5SDimitry Andric if (LDistVerify) { 8400b57cec5SDimitry Andric LI->verify(*DT); 8410b57cec5SDimitry Andric assert(DT->verify(DominatorTree::VerificationLevel::Fast)); 8420b57cec5SDimitry Andric } 8430b57cec5SDimitry Andric 8440b57cec5SDimitry Andric ++NumLoopsDistributed; 8450b57cec5SDimitry Andric // Report the success. 8460b57cec5SDimitry Andric ORE->emit([&]() { 8470b57cec5SDimitry Andric return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(), 8480b57cec5SDimitry Andric L->getHeader()) 8490b57cec5SDimitry Andric << "distributed loop"; 8500b57cec5SDimitry Andric }); 8510b57cec5SDimitry Andric return true; 8520b57cec5SDimitry Andric } 8530b57cec5SDimitry Andric 8540b57cec5SDimitry Andric /// Provide diagnostics then \return with false. 8550b57cec5SDimitry Andric bool fail(StringRef RemarkName, StringRef Message) { 8560b57cec5SDimitry Andric LLVMContext &Ctx = F->getContext(); 85781ad6265SDimitry Andric bool Forced = isForced().value_or(false); 8580b57cec5SDimitry Andric 859*0fca6ea1SDimitry Andric LLVM_DEBUG(dbgs() << "LDist: Skipping; " << Message << "\n"); 8600b57cec5SDimitry Andric 8610b57cec5SDimitry Andric // With Rpass-missed report that distribution failed. 8620b57cec5SDimitry Andric ORE->emit([&]() { 8630b57cec5SDimitry Andric return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed", 8640b57cec5SDimitry Andric L->getStartLoc(), L->getHeader()) 8650b57cec5SDimitry Andric << "loop not distributed: use -Rpass-analysis=loop-distribute for " 8660b57cec5SDimitry Andric "more " 8670b57cec5SDimitry Andric "info"; 8680b57cec5SDimitry Andric }); 8690b57cec5SDimitry Andric 8700b57cec5SDimitry Andric // With Rpass-analysis report why. This is on by default if distribution 8710b57cec5SDimitry Andric // was requested explicitly. 8720b57cec5SDimitry Andric ORE->emit(OptimizationRemarkAnalysis( 8730b57cec5SDimitry Andric Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME, 8740b57cec5SDimitry Andric RemarkName, L->getStartLoc(), L->getHeader()) 8750b57cec5SDimitry Andric << "loop not distributed: " << Message); 8760b57cec5SDimitry Andric 8770b57cec5SDimitry Andric // Also issue a warning if distribution was requested explicitly but it 8780b57cec5SDimitry Andric // failed. 8790b57cec5SDimitry Andric if (Forced) 8800b57cec5SDimitry Andric Ctx.diagnose(DiagnosticInfoOptimizationFailure( 8810b57cec5SDimitry Andric *F, L->getStartLoc(), "loop not distributed: failed " 8820b57cec5SDimitry Andric "explicitly specified loop distribution")); 8830b57cec5SDimitry Andric 8840b57cec5SDimitry Andric return false; 8850b57cec5SDimitry Andric } 8860b57cec5SDimitry Andric 8870b57cec5SDimitry Andric /// Return if distribution forced to be enabled/disabled for the loop. 8880b57cec5SDimitry Andric /// 8890b57cec5SDimitry Andric /// If the optional has a value, it indicates whether distribution was forced 8900b57cec5SDimitry Andric /// to be enabled (true) or disabled (false). If the optional has no value 8910b57cec5SDimitry Andric /// distribution was not forced either way. 892bdd1243dSDimitry Andric const std::optional<bool> &isForced() const { return IsForced; } 8930b57cec5SDimitry Andric 8940b57cec5SDimitry Andric private: 8950b57cec5SDimitry Andric /// Filter out checks between pointers from the same partition. 8960b57cec5SDimitry Andric /// 8970b57cec5SDimitry Andric /// \p PtrToPartition contains the partition number for pointers. Partition 8980b57cec5SDimitry Andric /// number -1 means that the pointer is used in multiple partitions. In this 8990b57cec5SDimitry Andric /// case we can't safely omit the check. 9005ffd83dbSDimitry Andric SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks( 9015ffd83dbSDimitry Andric const SmallVectorImpl<RuntimePointerCheck> &AllChecks, 9020b57cec5SDimitry Andric const SmallVectorImpl<int> &PtrToPartition, 9030b57cec5SDimitry Andric const RuntimePointerChecking *RtPtrChecking) { 9045ffd83dbSDimitry Andric SmallVector<RuntimePointerCheck, 4> Checks; 9050b57cec5SDimitry Andric 9060b57cec5SDimitry Andric copy_if(AllChecks, std::back_inserter(Checks), 9075ffd83dbSDimitry Andric [&](const RuntimePointerCheck &Check) { 9080b57cec5SDimitry Andric for (unsigned PtrIdx1 : Check.first->Members) 9090b57cec5SDimitry Andric for (unsigned PtrIdx2 : Check.second->Members) 9100b57cec5SDimitry Andric // Only include this check if there is a pair of pointers 9110b57cec5SDimitry Andric // that require checking and the pointers fall into 9120b57cec5SDimitry Andric // separate partitions. 9130b57cec5SDimitry Andric // 9140b57cec5SDimitry Andric // (Note that we already know at this point that the two 9150b57cec5SDimitry Andric // pointer groups need checking but it doesn't follow 9160b57cec5SDimitry Andric // that each pair of pointers within the two groups need 9170b57cec5SDimitry Andric // checking as well. 9180b57cec5SDimitry Andric // 9190b57cec5SDimitry Andric // In other words we don't want to include a check just 9200b57cec5SDimitry Andric // because there is a pair of pointers between the two 9210b57cec5SDimitry Andric // pointer groups that require checks and a different 9220b57cec5SDimitry Andric // pair whose pointers fall into different partitions.) 9230b57cec5SDimitry Andric if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && 9240b57cec5SDimitry Andric !RuntimePointerChecking::arePointersInSamePartition( 9250b57cec5SDimitry Andric PtrToPartition, PtrIdx1, PtrIdx2)) 9260b57cec5SDimitry Andric return true; 9270b57cec5SDimitry Andric return false; 9280b57cec5SDimitry Andric }); 9290b57cec5SDimitry Andric 9300b57cec5SDimitry Andric return Checks; 9310b57cec5SDimitry Andric } 9320b57cec5SDimitry Andric 9330b57cec5SDimitry Andric /// Check whether the loop metadata is forcing distribution to be 9340b57cec5SDimitry Andric /// enabled/disabled. 9350b57cec5SDimitry Andric void setForced() { 936bdd1243dSDimitry Andric std::optional<const MDOperand *> Value = 9370b57cec5SDimitry Andric findStringMetadataForLoop(L, "llvm.loop.distribute.enable"); 9380b57cec5SDimitry Andric if (!Value) 9390b57cec5SDimitry Andric return; 9400b57cec5SDimitry Andric 9410b57cec5SDimitry Andric const MDOperand *Op = *Value; 9420b57cec5SDimitry Andric assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata"); 9430b57cec5SDimitry Andric IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue(); 9440b57cec5SDimitry Andric } 9450b57cec5SDimitry Andric 9460b57cec5SDimitry Andric Loop *L; 9470b57cec5SDimitry Andric Function *F; 9480b57cec5SDimitry Andric 9490b57cec5SDimitry Andric // Analyses used. 9500b57cec5SDimitry Andric LoopInfo *LI; 9510b57cec5SDimitry Andric const LoopAccessInfo *LAI = nullptr; 9520b57cec5SDimitry Andric DominatorTree *DT; 9530b57cec5SDimitry Andric ScalarEvolution *SE; 954bdd1243dSDimitry Andric LoopAccessInfoManager &LAIs; 9550b57cec5SDimitry Andric OptimizationRemarkEmitter *ORE; 9560b57cec5SDimitry Andric 9570b57cec5SDimitry Andric /// Indicates whether distribution is forced to be enabled/disabled for 9580b57cec5SDimitry Andric /// the loop. 9590b57cec5SDimitry Andric /// 9600b57cec5SDimitry Andric /// If the optional has a value, it indicates whether distribution was forced 9610b57cec5SDimitry Andric /// to be enabled (true) or disabled (false). If the optional has no value 9620b57cec5SDimitry Andric /// distribution was not forced either way. 963bdd1243dSDimitry Andric std::optional<bool> IsForced; 9640b57cec5SDimitry Andric }; 9650b57cec5SDimitry Andric 9660b57cec5SDimitry Andric } // end anonymous namespace 9670b57cec5SDimitry Andric 9680b57cec5SDimitry Andric static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT, 9690b57cec5SDimitry Andric ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, 970bdd1243dSDimitry Andric LoopAccessInfoManager &LAIs) { 971*0fca6ea1SDimitry Andric // Build up a worklist of inner-loops to distribute. This is necessary as the 9720b57cec5SDimitry Andric // act of distributing a loop creates new loops and can invalidate iterators 9730b57cec5SDimitry Andric // across the loops. 9740b57cec5SDimitry Andric SmallVector<Loop *, 8> Worklist; 9750b57cec5SDimitry Andric 9760b57cec5SDimitry Andric for (Loop *TopLevelLoop : *LI) 9770b57cec5SDimitry Andric for (Loop *L : depth_first(TopLevelLoop)) 9780b57cec5SDimitry Andric // We only handle inner-most loops. 979e8d8bef9SDimitry Andric if (L->isInnermost()) 9800b57cec5SDimitry Andric Worklist.push_back(L); 9810b57cec5SDimitry Andric 9820b57cec5SDimitry Andric // Now walk the identified inner loops. 9830b57cec5SDimitry Andric bool Changed = false; 9840b57cec5SDimitry Andric for (Loop *L : Worklist) { 985bdd1243dSDimitry Andric LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE); 9860b57cec5SDimitry Andric 9870b57cec5SDimitry Andric // If distribution was forced for the specific loop to be 9880b57cec5SDimitry Andric // enabled/disabled, follow that. Otherwise use the global flag. 98981ad6265SDimitry Andric if (LDL.isForced().value_or(EnableLoopDistribute)) 990bdd1243dSDimitry Andric Changed |= LDL.processLoop(); 9910b57cec5SDimitry Andric } 9920b57cec5SDimitry Andric 9930b57cec5SDimitry Andric // Process each loop nest in the function. 9940b57cec5SDimitry Andric return Changed; 9950b57cec5SDimitry Andric } 9960b57cec5SDimitry Andric 9970b57cec5SDimitry Andric PreservedAnalyses LoopDistributePass::run(Function &F, 9980b57cec5SDimitry Andric FunctionAnalysisManager &AM) { 9990b57cec5SDimitry Andric auto &LI = AM.getResult<LoopAnalysis>(F); 10000b57cec5SDimitry Andric auto &DT = AM.getResult<DominatorTreeAnalysis>(F); 10010b57cec5SDimitry Andric auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); 10020b57cec5SDimitry Andric auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F); 10030b57cec5SDimitry Andric 1004bdd1243dSDimitry Andric LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F); 1005bdd1243dSDimitry Andric bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs); 10060b57cec5SDimitry Andric if (!Changed) 10070b57cec5SDimitry Andric return PreservedAnalyses::all(); 10080b57cec5SDimitry Andric PreservedAnalyses PA; 10090b57cec5SDimitry Andric PA.preserve<LoopAnalysis>(); 10100b57cec5SDimitry Andric PA.preserve<DominatorTreeAnalysis>(); 10110b57cec5SDimitry Andric return PA; 10120b57cec5SDimitry Andric } 1013