109467b48Spatrick //===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
209467b48Spatrick //
309467b48Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
409467b48Spatrick // See https://llvm.org/LICENSE.txt for license information.
509467b48Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
609467b48Spatrick //
709467b48Spatrick //===----------------------------------------------------------------------===//
809467b48Spatrick //
909467b48Spatrick // This file implements the Loop Distribution Pass. Its main focus is to
1009467b48Spatrick // distribute loops that cannot be vectorized due to dependence cycles. It
1109467b48Spatrick // tries to isolate the offending dependences into a new loop allowing
1209467b48Spatrick // vectorization of the remaining parts.
1309467b48Spatrick //
1409467b48Spatrick // For dependence analysis, the pass uses the LoopVectorizer's
1509467b48Spatrick // LoopAccessAnalysis. Because this analysis presumes no change in the order of
1609467b48Spatrick // memory operations, special care is taken to preserve the lexical order of
1709467b48Spatrick // these operations.
1809467b48Spatrick //
1909467b48Spatrick // Similarly to the Vectorizer, the pass also supports loop versioning to
2009467b48Spatrick // run-time disambiguate potentially overlapping arrays.
2109467b48Spatrick //
2209467b48Spatrick //===----------------------------------------------------------------------===//
2309467b48Spatrick
2409467b48Spatrick #include "llvm/Transforms/Scalar/LoopDistribute.h"
2509467b48Spatrick #include "llvm/ADT/DenseMap.h"
2609467b48Spatrick #include "llvm/ADT/DepthFirstIterator.h"
2709467b48Spatrick #include "llvm/ADT/EquivalenceClasses.h"
2809467b48Spatrick #include "llvm/ADT/STLExtras.h"
2909467b48Spatrick #include "llvm/ADT/SmallPtrSet.h"
3009467b48Spatrick #include "llvm/ADT/SmallVector.h"
3109467b48Spatrick #include "llvm/ADT/Statistic.h"
3209467b48Spatrick #include "llvm/ADT/StringRef.h"
3309467b48Spatrick #include "llvm/ADT/Twine.h"
3409467b48Spatrick #include "llvm/ADT/iterator_range.h"
3509467b48Spatrick #include "llvm/Analysis/AssumptionCache.h"
3609467b48Spatrick #include "llvm/Analysis/GlobalsModRef.h"
3709467b48Spatrick #include "llvm/Analysis/LoopAccessAnalysis.h"
3809467b48Spatrick #include "llvm/Analysis/LoopAnalysisManager.h"
3909467b48Spatrick #include "llvm/Analysis/LoopInfo.h"
4009467b48Spatrick #include "llvm/Analysis/OptimizationRemarkEmitter.h"
4109467b48Spatrick #include "llvm/Analysis/ScalarEvolution.h"
4209467b48Spatrick #include "llvm/Analysis/TargetLibraryInfo.h"
4309467b48Spatrick #include "llvm/Analysis/TargetTransformInfo.h"
4409467b48Spatrick #include "llvm/IR/BasicBlock.h"
4509467b48Spatrick #include "llvm/IR/Constants.h"
4609467b48Spatrick #include "llvm/IR/DiagnosticInfo.h"
4709467b48Spatrick #include "llvm/IR/Dominators.h"
4809467b48Spatrick #include "llvm/IR/Function.h"
4909467b48Spatrick #include "llvm/IR/Instruction.h"
5009467b48Spatrick #include "llvm/IR/Instructions.h"
5109467b48Spatrick #include "llvm/IR/LLVMContext.h"
5209467b48Spatrick #include "llvm/IR/Metadata.h"
5309467b48Spatrick #include "llvm/IR/PassManager.h"
5409467b48Spatrick #include "llvm/IR/Value.h"
5509467b48Spatrick #include "llvm/InitializePasses.h"
5609467b48Spatrick #include "llvm/Pass.h"
5709467b48Spatrick #include "llvm/Support/Casting.h"
5809467b48Spatrick #include "llvm/Support/CommandLine.h"
5909467b48Spatrick #include "llvm/Support/Debug.h"
6009467b48Spatrick #include "llvm/Support/raw_ostream.h"
6109467b48Spatrick #include "llvm/Transforms/Scalar.h"
6209467b48Spatrick #include "llvm/Transforms/Utils/BasicBlockUtils.h"
6309467b48Spatrick #include "llvm/Transforms/Utils/Cloning.h"
6409467b48Spatrick #include "llvm/Transforms/Utils/LoopUtils.h"
6509467b48Spatrick #include "llvm/Transforms/Utils/LoopVersioning.h"
6609467b48Spatrick #include "llvm/Transforms/Utils/ValueMapper.h"
6709467b48Spatrick #include <cassert>
6809467b48Spatrick #include <functional>
6909467b48Spatrick #include <list>
7009467b48Spatrick #include <tuple>
7109467b48Spatrick #include <utility>
7209467b48Spatrick
7309467b48Spatrick using namespace llvm;
7409467b48Spatrick
7509467b48Spatrick #define LDIST_NAME "loop-distribute"
7609467b48Spatrick #define DEBUG_TYPE LDIST_NAME
7709467b48Spatrick
7809467b48Spatrick /// @{
7909467b48Spatrick /// Metadata attribute names
8009467b48Spatrick static const char *const LLVMLoopDistributeFollowupAll =
8109467b48Spatrick "llvm.loop.distribute.followup_all";
8209467b48Spatrick static const char *const LLVMLoopDistributeFollowupCoincident =
8309467b48Spatrick "llvm.loop.distribute.followup_coincident";
8409467b48Spatrick static const char *const LLVMLoopDistributeFollowupSequential =
8509467b48Spatrick "llvm.loop.distribute.followup_sequential";
8609467b48Spatrick static const char *const LLVMLoopDistributeFollowupFallback =
8709467b48Spatrick "llvm.loop.distribute.followup_fallback";
8809467b48Spatrick /// @}
8909467b48Spatrick
9009467b48Spatrick static cl::opt<bool>
9109467b48Spatrick LDistVerify("loop-distribute-verify", cl::Hidden,
9209467b48Spatrick cl::desc("Turn on DominatorTree and LoopInfo verification "
9309467b48Spatrick "after Loop Distribution"),
9409467b48Spatrick cl::init(false));
9509467b48Spatrick
9609467b48Spatrick static cl::opt<bool> DistributeNonIfConvertible(
9709467b48Spatrick "loop-distribute-non-if-convertible", cl::Hidden,
9809467b48Spatrick cl::desc("Whether to distribute into a loop that may not be "
9909467b48Spatrick "if-convertible by the loop vectorizer"),
10009467b48Spatrick cl::init(false));
10109467b48Spatrick
10209467b48Spatrick static cl::opt<unsigned> DistributeSCEVCheckThreshold(
10309467b48Spatrick "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
10409467b48Spatrick cl::desc("The maximum number of SCEV checks allowed for Loop "
10509467b48Spatrick "Distribution"));
10609467b48Spatrick
10709467b48Spatrick static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
10809467b48Spatrick "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
10909467b48Spatrick cl::Hidden,
11009467b48Spatrick cl::desc(
11109467b48Spatrick "The maximum number of SCEV checks allowed for Loop "
11209467b48Spatrick "Distribution for loop marked with #pragma loop distribute(enable)"));
11309467b48Spatrick
11409467b48Spatrick static cl::opt<bool> EnableLoopDistribute(
11509467b48Spatrick "enable-loop-distribute", cl::Hidden,
11609467b48Spatrick cl::desc("Enable the new, experimental LoopDistribution Pass"),
11709467b48Spatrick cl::init(false));
11809467b48Spatrick
11909467b48Spatrick STATISTIC(NumLoopsDistributed, "Number of loops distributed");
12009467b48Spatrick
12109467b48Spatrick namespace {
12209467b48Spatrick
12309467b48Spatrick /// Maintains the set of instructions of the loop for a partition before
12409467b48Spatrick /// cloning. After cloning, it hosts the new loop.
12509467b48Spatrick class InstPartition {
12609467b48Spatrick using InstructionSet = SmallPtrSet<Instruction *, 8>;
12709467b48Spatrick
12809467b48Spatrick public:
InstPartition(Instruction * I,Loop * L,bool DepCycle=false)12909467b48Spatrick InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
13009467b48Spatrick : DepCycle(DepCycle), OrigLoop(L) {
13109467b48Spatrick Set.insert(I);
13209467b48Spatrick }
13309467b48Spatrick
13409467b48Spatrick /// Returns whether this partition contains a dependence cycle.
hasDepCycle() const13509467b48Spatrick bool hasDepCycle() const { return DepCycle; }
13609467b48Spatrick
13709467b48Spatrick /// Adds an instruction to this partition.
add(Instruction * I)13809467b48Spatrick void add(Instruction *I) { Set.insert(I); }
13909467b48Spatrick
14009467b48Spatrick /// Collection accessors.
begin()14109467b48Spatrick InstructionSet::iterator begin() { return Set.begin(); }
end()14209467b48Spatrick InstructionSet::iterator end() { return Set.end(); }
begin() const14309467b48Spatrick InstructionSet::const_iterator begin() const { return Set.begin(); }
end() const14409467b48Spatrick InstructionSet::const_iterator end() const { return Set.end(); }
empty() const14509467b48Spatrick bool empty() const { return Set.empty(); }
14609467b48Spatrick
14709467b48Spatrick /// Moves this partition into \p Other. This partition becomes empty
14809467b48Spatrick /// after this.
moveTo(InstPartition & Other)14909467b48Spatrick void moveTo(InstPartition &Other) {
15009467b48Spatrick Other.Set.insert(Set.begin(), Set.end());
15109467b48Spatrick Set.clear();
15209467b48Spatrick Other.DepCycle |= DepCycle;
15309467b48Spatrick }
15409467b48Spatrick
15509467b48Spatrick /// Populates the partition with a transitive closure of all the
15609467b48Spatrick /// instructions that the seeded instructions dependent on.
populateUsedSet()15709467b48Spatrick void populateUsedSet() {
15809467b48Spatrick // FIXME: We currently don't use control-dependence but simply include all
15909467b48Spatrick // blocks (possibly empty at the end) and let simplifycfg mostly clean this
16009467b48Spatrick // up.
16109467b48Spatrick for (auto *B : OrigLoop->getBlocks())
16209467b48Spatrick Set.insert(B->getTerminator());
16309467b48Spatrick
16409467b48Spatrick // Follow the use-def chains to form a transitive closure of all the
16509467b48Spatrick // instructions that the originally seeded instructions depend on.
16609467b48Spatrick SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
16709467b48Spatrick while (!Worklist.empty()) {
16809467b48Spatrick Instruction *I = Worklist.pop_back_val();
16909467b48Spatrick // Insert instructions from the loop that we depend on.
17009467b48Spatrick for (Value *V : I->operand_values()) {
17109467b48Spatrick auto *I = dyn_cast<Instruction>(V);
17209467b48Spatrick if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
17309467b48Spatrick Worklist.push_back(I);
17409467b48Spatrick }
17509467b48Spatrick }
17609467b48Spatrick }
17709467b48Spatrick
17809467b48Spatrick /// Clones the original loop.
17909467b48Spatrick ///
18009467b48Spatrick /// Updates LoopInfo and DominatorTree using the information that block \p
18109467b48Spatrick /// LoopDomBB dominates the loop.
cloneLoopWithPreheader(BasicBlock * InsertBefore,BasicBlock * LoopDomBB,unsigned Index,LoopInfo * LI,DominatorTree * DT)18209467b48Spatrick Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
18309467b48Spatrick unsigned Index, LoopInfo *LI,
18409467b48Spatrick DominatorTree *DT) {
18509467b48Spatrick ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
18609467b48Spatrick VMap, Twine(".ldist") + Twine(Index),
18709467b48Spatrick LI, DT, ClonedLoopBlocks);
18809467b48Spatrick return ClonedLoop;
18909467b48Spatrick }
19009467b48Spatrick
19109467b48Spatrick /// The cloned loop. If this partition is mapped to the original loop,
19209467b48Spatrick /// this is null.
getClonedLoop() const19309467b48Spatrick const Loop *getClonedLoop() const { return ClonedLoop; }
19409467b48Spatrick
19509467b48Spatrick /// Returns the loop where this partition ends up after distribution.
19609467b48Spatrick /// If this partition is mapped to the original loop then use the block from
19709467b48Spatrick /// the loop.
getDistributedLoop() const19809467b48Spatrick Loop *getDistributedLoop() const {
19909467b48Spatrick return ClonedLoop ? ClonedLoop : OrigLoop;
20009467b48Spatrick }
20109467b48Spatrick
20209467b48Spatrick /// The VMap that is populated by cloning and then used in
20309467b48Spatrick /// remapinstruction to remap the cloned instructions.
getVMap()20409467b48Spatrick ValueToValueMapTy &getVMap() { return VMap; }
20509467b48Spatrick
20609467b48Spatrick /// Remaps the cloned instructions using VMap.
remapInstructions()20709467b48Spatrick void remapInstructions() {
20809467b48Spatrick remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
20909467b48Spatrick }
21009467b48Spatrick
21109467b48Spatrick /// Based on the set of instructions selected for this partition,
21209467b48Spatrick /// removes the unnecessary ones.
removeUnusedInsts()21309467b48Spatrick void removeUnusedInsts() {
21409467b48Spatrick SmallVector<Instruction *, 8> Unused;
21509467b48Spatrick
21609467b48Spatrick for (auto *Block : OrigLoop->getBlocks())
21709467b48Spatrick for (auto &Inst : *Block)
21809467b48Spatrick if (!Set.count(&Inst)) {
21909467b48Spatrick Instruction *NewInst = &Inst;
22009467b48Spatrick if (!VMap.empty())
22109467b48Spatrick NewInst = cast<Instruction>(VMap[NewInst]);
22209467b48Spatrick
22309467b48Spatrick assert(!isa<BranchInst>(NewInst) &&
22409467b48Spatrick "Branches are marked used early on");
22509467b48Spatrick Unused.push_back(NewInst);
22609467b48Spatrick }
22709467b48Spatrick
22809467b48Spatrick // Delete the instructions backwards, as it has a reduced likelihood of
22909467b48Spatrick // having to update as many def-use and use-def chains.
23009467b48Spatrick for (auto *Inst : reverse(Unused)) {
23109467b48Spatrick if (!Inst->use_empty())
232*d415bd75Srobert Inst->replaceAllUsesWith(PoisonValue::get(Inst->getType()));
23309467b48Spatrick Inst->eraseFromParent();
23409467b48Spatrick }
23509467b48Spatrick }
23609467b48Spatrick
print() const23709467b48Spatrick void print() const {
23809467b48Spatrick if (DepCycle)
23909467b48Spatrick dbgs() << " (cycle)\n";
24009467b48Spatrick for (auto *I : Set)
24109467b48Spatrick // Prefix with the block name.
24209467b48Spatrick dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
24309467b48Spatrick }
24409467b48Spatrick
printBlocks() const24509467b48Spatrick void printBlocks() const {
24609467b48Spatrick for (auto *BB : getDistributedLoop()->getBlocks())
24709467b48Spatrick dbgs() << *BB;
24809467b48Spatrick }
24909467b48Spatrick
25009467b48Spatrick private:
25109467b48Spatrick /// Instructions from OrigLoop selected for this partition.
25209467b48Spatrick InstructionSet Set;
25309467b48Spatrick
25409467b48Spatrick /// Whether this partition contains a dependence cycle.
25509467b48Spatrick bool DepCycle;
25609467b48Spatrick
25709467b48Spatrick /// The original loop.
25809467b48Spatrick Loop *OrigLoop;
25909467b48Spatrick
26009467b48Spatrick /// The cloned loop. If this partition is mapped to the original loop,
26109467b48Spatrick /// this is null.
26209467b48Spatrick Loop *ClonedLoop = nullptr;
26309467b48Spatrick
26409467b48Spatrick /// The blocks of ClonedLoop including the preheader. If this
26509467b48Spatrick /// partition is mapped to the original loop, this is empty.
26609467b48Spatrick SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
26709467b48Spatrick
26809467b48Spatrick /// These gets populated once the set of instructions have been
26909467b48Spatrick /// finalized. If this partition is mapped to the original loop, these are not
27009467b48Spatrick /// set.
27109467b48Spatrick ValueToValueMapTy VMap;
27209467b48Spatrick };
27309467b48Spatrick
27409467b48Spatrick /// Holds the set of Partitions. It populates them, merges them and then
27509467b48Spatrick /// clones the loops.
27609467b48Spatrick class InstPartitionContainer {
27709467b48Spatrick using InstToPartitionIdT = DenseMap<Instruction *, int>;
27809467b48Spatrick
27909467b48Spatrick public:
InstPartitionContainer(Loop * L,LoopInfo * LI,DominatorTree * DT)28009467b48Spatrick InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
28109467b48Spatrick : L(L), LI(LI), DT(DT) {}
28209467b48Spatrick
28309467b48Spatrick /// Returns the number of partitions.
getSize() const28409467b48Spatrick unsigned getSize() const { return PartitionContainer.size(); }
28509467b48Spatrick
28609467b48Spatrick /// Adds \p Inst into the current partition if that is marked to
28709467b48Spatrick /// contain cycles. Otherwise start a new partition for it.
addToCyclicPartition(Instruction * Inst)28809467b48Spatrick void addToCyclicPartition(Instruction *Inst) {
28909467b48Spatrick // If the current partition is non-cyclic. Start a new one.
29009467b48Spatrick if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
29109467b48Spatrick PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
29209467b48Spatrick else
29309467b48Spatrick PartitionContainer.back().add(Inst);
29409467b48Spatrick }
29509467b48Spatrick
29609467b48Spatrick /// Adds \p Inst into a partition that is not marked to contain
29709467b48Spatrick /// dependence cycles.
29809467b48Spatrick ///
29909467b48Spatrick // Initially we isolate memory instructions into as many partitions as
30009467b48Spatrick // possible, then later we may merge them back together.
addToNewNonCyclicPartition(Instruction * Inst)30109467b48Spatrick void addToNewNonCyclicPartition(Instruction *Inst) {
30209467b48Spatrick PartitionContainer.emplace_back(Inst, L);
30309467b48Spatrick }
30409467b48Spatrick
30509467b48Spatrick /// Merges adjacent non-cyclic partitions.
30609467b48Spatrick ///
30709467b48Spatrick /// The idea is that we currently only want to isolate the non-vectorizable
30809467b48Spatrick /// partition. We could later allow more distribution among these partition
30909467b48Spatrick /// too.
mergeAdjacentNonCyclic()31009467b48Spatrick void mergeAdjacentNonCyclic() {
31109467b48Spatrick mergeAdjacentPartitionsIf(
31209467b48Spatrick [](const InstPartition *P) { return !P->hasDepCycle(); });
31309467b48Spatrick }
31409467b48Spatrick
31509467b48Spatrick /// If a partition contains only conditional stores, we won't vectorize
31609467b48Spatrick /// it. Try to merge it with a previous cyclic partition.
mergeNonIfConvertible()31709467b48Spatrick void mergeNonIfConvertible() {
31809467b48Spatrick mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
31909467b48Spatrick if (Partition->hasDepCycle())
32009467b48Spatrick return true;
32109467b48Spatrick
32209467b48Spatrick // Now, check if all stores are conditional in this partition.
32309467b48Spatrick bool seenStore = false;
32409467b48Spatrick
32509467b48Spatrick for (auto *Inst : *Partition)
32609467b48Spatrick if (isa<StoreInst>(Inst)) {
32709467b48Spatrick seenStore = true;
32809467b48Spatrick if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
32909467b48Spatrick return false;
33009467b48Spatrick }
33109467b48Spatrick return seenStore;
33209467b48Spatrick });
33309467b48Spatrick }
33409467b48Spatrick
33509467b48Spatrick /// Merges the partitions according to various heuristics.
mergeBeforePopulating()33609467b48Spatrick void mergeBeforePopulating() {
33709467b48Spatrick mergeAdjacentNonCyclic();
33809467b48Spatrick if (!DistributeNonIfConvertible)
33909467b48Spatrick mergeNonIfConvertible();
34009467b48Spatrick }
34109467b48Spatrick
34209467b48Spatrick /// Merges partitions in order to ensure that no loads are duplicated.
34309467b48Spatrick ///
34409467b48Spatrick /// We can't duplicate loads because that could potentially reorder them.
34509467b48Spatrick /// LoopAccessAnalysis provides dependency information with the context that
34609467b48Spatrick /// the order of memory operation is preserved.
34709467b48Spatrick ///
34809467b48Spatrick /// Return if any partitions were merged.
mergeToAvoidDuplicatedLoads()34909467b48Spatrick bool mergeToAvoidDuplicatedLoads() {
35009467b48Spatrick using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
35109467b48Spatrick using ToBeMergedT = EquivalenceClasses<InstPartition *>;
35209467b48Spatrick
35309467b48Spatrick LoadToPartitionT LoadToPartition;
35409467b48Spatrick ToBeMergedT ToBeMerged;
35509467b48Spatrick
35609467b48Spatrick // Step through the partitions and create equivalence between partitions
35709467b48Spatrick // that contain the same load. Also put partitions in between them in the
35809467b48Spatrick // same equivalence class to avoid reordering of memory operations.
35909467b48Spatrick for (PartitionContainerT::iterator I = PartitionContainer.begin(),
36009467b48Spatrick E = PartitionContainer.end();
36109467b48Spatrick I != E; ++I) {
36209467b48Spatrick auto *PartI = &*I;
36309467b48Spatrick
36409467b48Spatrick // If a load occurs in two partitions PartI and PartJ, merge all
36509467b48Spatrick // partitions (PartI, PartJ] into PartI.
36609467b48Spatrick for (Instruction *Inst : *PartI)
36709467b48Spatrick if (isa<LoadInst>(Inst)) {
36809467b48Spatrick bool NewElt;
36909467b48Spatrick LoadToPartitionT::iterator LoadToPart;
37009467b48Spatrick
37109467b48Spatrick std::tie(LoadToPart, NewElt) =
37209467b48Spatrick LoadToPartition.insert(std::make_pair(Inst, PartI));
37309467b48Spatrick if (!NewElt) {
37409467b48Spatrick LLVM_DEBUG(dbgs()
37509467b48Spatrick << "Merging partitions due to this load in multiple "
37609467b48Spatrick << "partitions: " << PartI << ", " << LoadToPart->second
37709467b48Spatrick << "\n"
37809467b48Spatrick << *Inst << "\n");
37909467b48Spatrick
38009467b48Spatrick auto PartJ = I;
38109467b48Spatrick do {
38209467b48Spatrick --PartJ;
38309467b48Spatrick ToBeMerged.unionSets(PartI, &*PartJ);
38409467b48Spatrick } while (&*PartJ != LoadToPart->second);
38509467b48Spatrick }
38609467b48Spatrick }
38709467b48Spatrick }
38809467b48Spatrick if (ToBeMerged.empty())
38909467b48Spatrick return false;
39009467b48Spatrick
39109467b48Spatrick // Merge the member of an equivalence class into its class leader. This
39209467b48Spatrick // makes the members empty.
39309467b48Spatrick for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
39409467b48Spatrick I != E; ++I) {
39509467b48Spatrick if (!I->isLeader())
39609467b48Spatrick continue;
39709467b48Spatrick
39809467b48Spatrick auto PartI = I->getData();
399*d415bd75Srobert for (auto *PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
40009467b48Spatrick ToBeMerged.member_end())) {
40109467b48Spatrick PartJ->moveTo(*PartI);
40209467b48Spatrick }
40309467b48Spatrick }
40409467b48Spatrick
40509467b48Spatrick // Remove the empty partitions.
40609467b48Spatrick PartitionContainer.remove_if(
40709467b48Spatrick [](const InstPartition &P) { return P.empty(); });
40809467b48Spatrick
40909467b48Spatrick return true;
41009467b48Spatrick }
41109467b48Spatrick
41209467b48Spatrick /// Sets up the mapping between instructions to partitions. If the
41309467b48Spatrick /// instruction is duplicated across multiple partitions, set the entry to -1.
setupPartitionIdOnInstructions()41409467b48Spatrick void setupPartitionIdOnInstructions() {
41509467b48Spatrick int PartitionID = 0;
41609467b48Spatrick for (const auto &Partition : PartitionContainer) {
41709467b48Spatrick for (Instruction *Inst : Partition) {
41809467b48Spatrick bool NewElt;
41909467b48Spatrick InstToPartitionIdT::iterator Iter;
42009467b48Spatrick
42109467b48Spatrick std::tie(Iter, NewElt) =
42209467b48Spatrick InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
42309467b48Spatrick if (!NewElt)
42409467b48Spatrick Iter->second = -1;
42509467b48Spatrick }
42609467b48Spatrick ++PartitionID;
42709467b48Spatrick }
42809467b48Spatrick }
42909467b48Spatrick
43009467b48Spatrick /// Populates the partition with everything that the seeding
43109467b48Spatrick /// instructions require.
populateUsedSet()43209467b48Spatrick void populateUsedSet() {
43309467b48Spatrick for (auto &P : PartitionContainer)
43409467b48Spatrick P.populateUsedSet();
43509467b48Spatrick }
43609467b48Spatrick
43709467b48Spatrick /// This performs the main chunk of the work of cloning the loops for
43809467b48Spatrick /// the partitions.
cloneLoops()43909467b48Spatrick void cloneLoops() {
44009467b48Spatrick BasicBlock *OrigPH = L->getLoopPreheader();
44109467b48Spatrick // At this point the predecessor of the preheader is either the memcheck
44209467b48Spatrick // block or the top part of the original preheader.
44309467b48Spatrick BasicBlock *Pred = OrigPH->getSinglePredecessor();
44409467b48Spatrick assert(Pred && "Preheader does not have a single predecessor");
44509467b48Spatrick BasicBlock *ExitBlock = L->getExitBlock();
44609467b48Spatrick assert(ExitBlock && "No single exit block");
44709467b48Spatrick Loop *NewLoop;
44809467b48Spatrick
44909467b48Spatrick assert(!PartitionContainer.empty() && "at least two partitions expected");
45009467b48Spatrick // We're cloning the preheader along with the loop so we already made sure
45109467b48Spatrick // it was empty.
45209467b48Spatrick assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
45309467b48Spatrick "preheader not empty");
45409467b48Spatrick
45509467b48Spatrick // Preserve the original loop ID for use after the transformation.
45609467b48Spatrick MDNode *OrigLoopID = L->getLoopID();
45709467b48Spatrick
45809467b48Spatrick // Create a loop for each partition except the last. Clone the original
45909467b48Spatrick // loop before PH along with adding a preheader for the cloned loop. Then
46009467b48Spatrick // update PH to point to the newly added preheader.
46109467b48Spatrick BasicBlock *TopPH = OrigPH;
46209467b48Spatrick unsigned Index = getSize() - 1;
463*d415bd75Srobert for (auto &Part : llvm::drop_begin(llvm::reverse(PartitionContainer))) {
464*d415bd75Srobert NewLoop = Part.cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
46509467b48Spatrick
466*d415bd75Srobert Part.getVMap()[ExitBlock] = TopPH;
467*d415bd75Srobert Part.remapInstructions();
468*d415bd75Srobert setNewLoopID(OrigLoopID, &Part);
469*d415bd75Srobert --Index;
470*d415bd75Srobert TopPH = NewLoop->getLoopPreheader();
47109467b48Spatrick }
47209467b48Spatrick Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
47309467b48Spatrick
47409467b48Spatrick // Also set a new loop ID for the last loop.
47509467b48Spatrick setNewLoopID(OrigLoopID, &PartitionContainer.back());
47609467b48Spatrick
47709467b48Spatrick // Now go in forward order and update the immediate dominator for the
47809467b48Spatrick // preheaders with the exiting block of the previous loop. Dominance
47909467b48Spatrick // within the loop is updated in cloneLoopWithPreheader.
48009467b48Spatrick for (auto Curr = PartitionContainer.cbegin(),
48109467b48Spatrick Next = std::next(PartitionContainer.cbegin()),
48209467b48Spatrick E = PartitionContainer.cend();
48309467b48Spatrick Next != E; ++Curr, ++Next)
48409467b48Spatrick DT->changeImmediateDominator(
48509467b48Spatrick Next->getDistributedLoop()->getLoopPreheader(),
48609467b48Spatrick Curr->getDistributedLoop()->getExitingBlock());
48709467b48Spatrick }
48809467b48Spatrick
48909467b48Spatrick /// Removes the dead instructions from the cloned loops.
removeUnusedInsts()49009467b48Spatrick void removeUnusedInsts() {
49109467b48Spatrick for (auto &Partition : PartitionContainer)
49209467b48Spatrick Partition.removeUnusedInsts();
49309467b48Spatrick }
49409467b48Spatrick
49509467b48Spatrick /// For each memory pointer, it computes the partitionId the pointer is
49609467b48Spatrick /// used in.
49709467b48Spatrick ///
49809467b48Spatrick /// This returns an array of int where the I-th entry corresponds to I-th
49909467b48Spatrick /// entry in LAI.getRuntimePointerCheck(). If the pointer is used in multiple
50009467b48Spatrick /// partitions its entry is set to -1.
50109467b48Spatrick SmallVector<int, 8>
computePartitionSetForPointers(const LoopAccessInfo & LAI)50209467b48Spatrick computePartitionSetForPointers(const LoopAccessInfo &LAI) {
50309467b48Spatrick const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
50409467b48Spatrick
50509467b48Spatrick unsigned N = RtPtrCheck->Pointers.size();
50609467b48Spatrick SmallVector<int, 8> PtrToPartitions(N);
50709467b48Spatrick for (unsigned I = 0; I < N; ++I) {
50809467b48Spatrick Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
50909467b48Spatrick auto Instructions =
51009467b48Spatrick LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
51109467b48Spatrick
51209467b48Spatrick int &Partition = PtrToPartitions[I];
51309467b48Spatrick // First set it to uninitialized.
51409467b48Spatrick Partition = -2;
51509467b48Spatrick for (Instruction *Inst : Instructions) {
51609467b48Spatrick // Note that this could be -1 if Inst is duplicated across multiple
51709467b48Spatrick // partitions.
51809467b48Spatrick int ThisPartition = this->InstToPartitionId[Inst];
51909467b48Spatrick if (Partition == -2)
52009467b48Spatrick Partition = ThisPartition;
52109467b48Spatrick // -1 means belonging to multiple partitions.
52209467b48Spatrick else if (Partition == -1)
52309467b48Spatrick break;
52409467b48Spatrick else if (Partition != (int)ThisPartition)
52509467b48Spatrick Partition = -1;
52609467b48Spatrick }
52709467b48Spatrick assert(Partition != -2 && "Pointer not belonging to any partition");
52809467b48Spatrick }
52909467b48Spatrick
53009467b48Spatrick return PtrToPartitions;
53109467b48Spatrick }
53209467b48Spatrick
print(raw_ostream & OS) const53309467b48Spatrick void print(raw_ostream &OS) const {
53409467b48Spatrick unsigned Index = 0;
53509467b48Spatrick for (const auto &P : PartitionContainer) {
53609467b48Spatrick OS << "Partition " << Index++ << " (" << &P << "):\n";
53709467b48Spatrick P.print();
53809467b48Spatrick }
53909467b48Spatrick }
54009467b48Spatrick
dump() const54109467b48Spatrick void dump() const { print(dbgs()); }
54209467b48Spatrick
54309467b48Spatrick #ifndef NDEBUG
operator <<(raw_ostream & OS,const InstPartitionContainer & Partitions)54409467b48Spatrick friend raw_ostream &operator<<(raw_ostream &OS,
54509467b48Spatrick const InstPartitionContainer &Partitions) {
54609467b48Spatrick Partitions.print(OS);
54709467b48Spatrick return OS;
54809467b48Spatrick }
54909467b48Spatrick #endif
55009467b48Spatrick
printBlocks() const55109467b48Spatrick void printBlocks() const {
55209467b48Spatrick unsigned Index = 0;
55309467b48Spatrick for (const auto &P : PartitionContainer) {
55409467b48Spatrick dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
55509467b48Spatrick P.printBlocks();
55609467b48Spatrick }
55709467b48Spatrick }
55809467b48Spatrick
55909467b48Spatrick private:
56009467b48Spatrick using PartitionContainerT = std::list<InstPartition>;
56109467b48Spatrick
56209467b48Spatrick /// List of partitions.
56309467b48Spatrick PartitionContainerT PartitionContainer;
56409467b48Spatrick
56509467b48Spatrick /// Mapping from Instruction to partition Id. If the instruction
56609467b48Spatrick /// belongs to multiple partitions the entry contains -1.
56709467b48Spatrick InstToPartitionIdT InstToPartitionId;
56809467b48Spatrick
56909467b48Spatrick Loop *L;
57009467b48Spatrick LoopInfo *LI;
57109467b48Spatrick DominatorTree *DT;
57209467b48Spatrick
57309467b48Spatrick /// The control structure to merge adjacent partitions if both satisfy
57409467b48Spatrick /// the \p Predicate.
57509467b48Spatrick template <class UnaryPredicate>
mergeAdjacentPartitionsIf(UnaryPredicate Predicate)57609467b48Spatrick void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
57709467b48Spatrick InstPartition *PrevMatch = nullptr;
57809467b48Spatrick for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
57909467b48Spatrick auto DoesMatch = Predicate(&*I);
58009467b48Spatrick if (PrevMatch == nullptr && DoesMatch) {
58109467b48Spatrick PrevMatch = &*I;
58209467b48Spatrick ++I;
58309467b48Spatrick } else if (PrevMatch != nullptr && DoesMatch) {
58409467b48Spatrick I->moveTo(*PrevMatch);
58509467b48Spatrick I = PartitionContainer.erase(I);
58609467b48Spatrick } else {
58709467b48Spatrick PrevMatch = nullptr;
58809467b48Spatrick ++I;
58909467b48Spatrick }
59009467b48Spatrick }
59109467b48Spatrick }
59209467b48Spatrick
59309467b48Spatrick /// Assign new LoopIDs for the partition's cloned loop.
setNewLoopID(MDNode * OrigLoopID,InstPartition * Part)59409467b48Spatrick void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
595*d415bd75Srobert std::optional<MDNode *> PartitionID = makeFollowupLoopID(
59609467b48Spatrick OrigLoopID,
59709467b48Spatrick {LLVMLoopDistributeFollowupAll,
59809467b48Spatrick Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
59909467b48Spatrick : LLVMLoopDistributeFollowupCoincident});
600*d415bd75Srobert if (PartitionID) {
60109467b48Spatrick Loop *NewLoop = Part->getDistributedLoop();
602*d415bd75Srobert NewLoop->setLoopID(*PartitionID);
60309467b48Spatrick }
60409467b48Spatrick }
60509467b48Spatrick };
60609467b48Spatrick
60709467b48Spatrick /// For each memory instruction, this class maintains difference of the
60809467b48Spatrick /// number of unsafe dependences that start out from this instruction minus
60909467b48Spatrick /// those that end here.
61009467b48Spatrick ///
61109467b48Spatrick /// By traversing the memory instructions in program order and accumulating this
61209467b48Spatrick /// number, we know whether any unsafe dependence crosses over a program point.
61309467b48Spatrick class MemoryInstructionDependences {
61409467b48Spatrick using Dependence = MemoryDepChecker::Dependence;
61509467b48Spatrick
61609467b48Spatrick public:
61709467b48Spatrick struct Entry {
61809467b48Spatrick Instruction *Inst;
61909467b48Spatrick unsigned NumUnsafeDependencesStartOrEnd = 0;
62009467b48Spatrick
Entry__anonf92e1f230111::MemoryInstructionDependences::Entry62109467b48Spatrick Entry(Instruction *Inst) : Inst(Inst) {}
62209467b48Spatrick };
62309467b48Spatrick
62409467b48Spatrick using AccessesType = SmallVector<Entry, 8>;
62509467b48Spatrick
begin() const62609467b48Spatrick AccessesType::const_iterator begin() const { return Accesses.begin(); }
end() const62709467b48Spatrick AccessesType::const_iterator end() const { return Accesses.end(); }
62809467b48Spatrick
MemoryInstructionDependences(const SmallVectorImpl<Instruction * > & Instructions,const SmallVectorImpl<Dependence> & Dependences)62909467b48Spatrick MemoryInstructionDependences(
63009467b48Spatrick const SmallVectorImpl<Instruction *> &Instructions,
63109467b48Spatrick const SmallVectorImpl<Dependence> &Dependences) {
63209467b48Spatrick Accesses.append(Instructions.begin(), Instructions.end());
63309467b48Spatrick
63409467b48Spatrick LLVM_DEBUG(dbgs() << "Backward dependences:\n");
635*d415bd75Srobert for (const auto &Dep : Dependences)
63609467b48Spatrick if (Dep.isPossiblyBackward()) {
63709467b48Spatrick // Note that the designations source and destination follow the program
63809467b48Spatrick // order, i.e. source is always first. (The direction is given by the
63909467b48Spatrick // DepType.)
64009467b48Spatrick ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
64109467b48Spatrick --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
64209467b48Spatrick
64309467b48Spatrick LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
64409467b48Spatrick }
64509467b48Spatrick }
64609467b48Spatrick
64709467b48Spatrick private:
64809467b48Spatrick AccessesType Accesses;
64909467b48Spatrick };
65009467b48Spatrick
65109467b48Spatrick /// The actual class performing the per-loop work.
65209467b48Spatrick class LoopDistributeForLoop {
65309467b48Spatrick public:
LoopDistributeForLoop(Loop * L,Function * F,LoopInfo * LI,DominatorTree * DT,ScalarEvolution * SE,LoopAccessInfoManager & LAIs,OptimizationRemarkEmitter * ORE)65409467b48Spatrick LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
655*d415bd75Srobert ScalarEvolution *SE, LoopAccessInfoManager &LAIs,
656*d415bd75Srobert OptimizationRemarkEmitter *ORE)
657*d415bd75Srobert : L(L), F(F), LI(LI), DT(DT), SE(SE), LAIs(LAIs), ORE(ORE) {
65809467b48Spatrick setForced();
65909467b48Spatrick }
66009467b48Spatrick
66109467b48Spatrick /// Try to distribute an inner-most loop.
processLoop()662*d415bd75Srobert bool processLoop() {
66373471bf0Spatrick assert(L->isInnermost() && "Only process inner loops.");
66409467b48Spatrick
66509467b48Spatrick LLVM_DEBUG(dbgs() << "\nLDist: In \""
66609467b48Spatrick << L->getHeader()->getParent()->getName()
66709467b48Spatrick << "\" checking " << *L << "\n");
66809467b48Spatrick
66973471bf0Spatrick // Having a single exit block implies there's also one exiting block.
67009467b48Spatrick if (!L->getExitBlock())
67109467b48Spatrick return fail("MultipleExitBlocks", "multiple exit blocks");
67209467b48Spatrick if (!L->isLoopSimplifyForm())
67309467b48Spatrick return fail("NotLoopSimplifyForm",
67409467b48Spatrick "loop is not in loop-simplify form");
67573471bf0Spatrick if (!L->isRotatedForm())
67673471bf0Spatrick return fail("NotBottomTested", "loop is not bottom tested");
67709467b48Spatrick
67809467b48Spatrick BasicBlock *PH = L->getLoopPreheader();
67909467b48Spatrick
680*d415bd75Srobert LAI = &LAIs.getInfo(*L);
68109467b48Spatrick
68209467b48Spatrick // Currently, we only distribute to isolate the part of the loop with
68309467b48Spatrick // dependence cycles to enable partial vectorization.
68409467b48Spatrick if (LAI->canVectorizeMemory())
68509467b48Spatrick return fail("MemOpsCanBeVectorized",
68609467b48Spatrick "memory operations are safe for vectorization");
68709467b48Spatrick
68809467b48Spatrick auto *Dependences = LAI->getDepChecker().getDependences();
68909467b48Spatrick if (!Dependences || Dependences->empty())
69009467b48Spatrick return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
69109467b48Spatrick
69209467b48Spatrick InstPartitionContainer Partitions(L, LI, DT);
69309467b48Spatrick
69409467b48Spatrick // First, go through each memory operation and assign them to consecutive
69509467b48Spatrick // partitions (the order of partitions follows program order). Put those
69609467b48Spatrick // with unsafe dependences into "cyclic" partition otherwise put each store
69709467b48Spatrick // in its own "non-cyclic" partition (we'll merge these later).
69809467b48Spatrick //
69909467b48Spatrick // Note that a memory operation (e.g. Load2 below) at a program point that
70009467b48Spatrick // has an unsafe dependence (Store3->Load1) spanning over it must be
70109467b48Spatrick // included in the same cyclic partition as the dependent operations. This
70209467b48Spatrick // is to preserve the original program order after distribution. E.g.:
70309467b48Spatrick //
70409467b48Spatrick // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
70509467b48Spatrick // Load1 -. 1 0->1
70609467b48Spatrick // Load2 | /Unsafe/ 0 1
70709467b48Spatrick // Store3 -' -1 1->0
70809467b48Spatrick // Load4 0 0
70909467b48Spatrick //
71009467b48Spatrick // NumUnsafeDependencesActive > 0 indicates this situation and in this case
71109467b48Spatrick // we just keep assigning to the same cyclic partition until
71209467b48Spatrick // NumUnsafeDependencesActive reaches 0.
71309467b48Spatrick const MemoryDepChecker &DepChecker = LAI->getDepChecker();
71409467b48Spatrick MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
71509467b48Spatrick *Dependences);
71609467b48Spatrick
71709467b48Spatrick int NumUnsafeDependencesActive = 0;
718*d415bd75Srobert for (const auto &InstDep : MID) {
71909467b48Spatrick Instruction *I = InstDep.Inst;
72009467b48Spatrick // We update NumUnsafeDependencesActive post-instruction, catch the
72109467b48Spatrick // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
72209467b48Spatrick if (NumUnsafeDependencesActive ||
72309467b48Spatrick InstDep.NumUnsafeDependencesStartOrEnd > 0)
72409467b48Spatrick Partitions.addToCyclicPartition(I);
72509467b48Spatrick else
72609467b48Spatrick Partitions.addToNewNonCyclicPartition(I);
72709467b48Spatrick NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
72809467b48Spatrick assert(NumUnsafeDependencesActive >= 0 &&
72909467b48Spatrick "Negative number of dependences active");
73009467b48Spatrick }
73109467b48Spatrick
73209467b48Spatrick // Add partitions for values used outside. These partitions can be out of
73309467b48Spatrick // order from the original program order. This is OK because if the
73409467b48Spatrick // partition uses a load we will merge this partition with the original
73509467b48Spatrick // partition of the load that we set up in the previous loop (see
73609467b48Spatrick // mergeToAvoidDuplicatedLoads).
73709467b48Spatrick auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
73809467b48Spatrick for (auto *Inst : DefsUsedOutside)
73909467b48Spatrick Partitions.addToNewNonCyclicPartition(Inst);
74009467b48Spatrick
74109467b48Spatrick LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
74209467b48Spatrick if (Partitions.getSize() < 2)
74309467b48Spatrick return fail("CantIsolateUnsafeDeps",
74409467b48Spatrick "cannot isolate unsafe dependencies");
74509467b48Spatrick
74609467b48Spatrick // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
74709467b48Spatrick // should be able to vectorize these together.
74809467b48Spatrick Partitions.mergeBeforePopulating();
74909467b48Spatrick LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
75009467b48Spatrick if (Partitions.getSize() < 2)
75109467b48Spatrick return fail("CantIsolateUnsafeDeps",
75209467b48Spatrick "cannot isolate unsafe dependencies");
75309467b48Spatrick
75409467b48Spatrick // Now, populate the partitions with non-memory operations.
75509467b48Spatrick Partitions.populateUsedSet();
75609467b48Spatrick LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
75709467b48Spatrick
75809467b48Spatrick // In order to preserve original lexical order for loads, keep them in the
75909467b48Spatrick // partition that we set up in the MemoryInstructionDependences loop.
76009467b48Spatrick if (Partitions.mergeToAvoidDuplicatedLoads()) {
76109467b48Spatrick LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
76209467b48Spatrick << Partitions);
76309467b48Spatrick if (Partitions.getSize() < 2)
76409467b48Spatrick return fail("CantIsolateUnsafeDeps",
76509467b48Spatrick "cannot isolate unsafe dependencies");
76609467b48Spatrick }
76709467b48Spatrick
76809467b48Spatrick // Don't distribute the loop if we need too many SCEV run-time checks, or
76909467b48Spatrick // any if it's illegal.
770*d415bd75Srobert const SCEVPredicate &Pred = LAI->getPSE().getPredicate();
77109467b48Spatrick if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
77209467b48Spatrick return fail("RuntimeCheckWithConvergent",
77309467b48Spatrick "may not insert runtime check with convergent operation");
77409467b48Spatrick }
77509467b48Spatrick
776*d415bd75Srobert if (Pred.getComplexity() > (IsForced.value_or(false)
77709467b48Spatrick ? PragmaDistributeSCEVCheckThreshold
77809467b48Spatrick : DistributeSCEVCheckThreshold))
77909467b48Spatrick return fail("TooManySCEVRuntimeChecks",
78009467b48Spatrick "too many SCEV run-time checks needed.\n");
78109467b48Spatrick
782*d415bd75Srobert if (!IsForced.value_or(false) && hasDisableAllTransformsHint(L))
78309467b48Spatrick return fail("HeuristicDisabled", "distribution heuristic disabled");
78409467b48Spatrick
78509467b48Spatrick LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
78609467b48Spatrick // We're done forming the partitions set up the reverse mapping from
78709467b48Spatrick // instructions to partitions.
78809467b48Spatrick Partitions.setupPartitionIdOnInstructions();
78909467b48Spatrick
79009467b48Spatrick // If we need run-time checks, version the loop now.
79109467b48Spatrick auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
79209467b48Spatrick const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
79309467b48Spatrick const auto &AllChecks = RtPtrChecking->getChecks();
79409467b48Spatrick auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
79509467b48Spatrick RtPtrChecking);
79609467b48Spatrick
79709467b48Spatrick if (LAI->hasConvergentOp() && !Checks.empty()) {
79809467b48Spatrick return fail("RuntimeCheckWithConvergent",
79909467b48Spatrick "may not insert runtime check with convergent operation");
80009467b48Spatrick }
80109467b48Spatrick
802097a140dSpatrick // To keep things simple have an empty preheader before we version or clone
803097a140dSpatrick // the loop. (Also split if this has no predecessor, i.e. entry, because we
804097a140dSpatrick // rely on PH having a predecessor.)
805097a140dSpatrick if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
806097a140dSpatrick SplitBlock(PH, PH->getTerminator(), DT, LI);
807097a140dSpatrick
80809467b48Spatrick if (!Pred.isAlwaysTrue() || !Checks.empty()) {
80909467b48Spatrick assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
81009467b48Spatrick
81109467b48Spatrick MDNode *OrigLoopID = L->getLoopID();
81209467b48Spatrick
81309467b48Spatrick LLVM_DEBUG(dbgs() << "\nPointers:\n");
81409467b48Spatrick LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
81573471bf0Spatrick LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
81609467b48Spatrick LVer.versionLoop(DefsUsedOutside);
81709467b48Spatrick LVer.annotateLoopWithNoAlias();
81809467b48Spatrick
81909467b48Spatrick // The unversioned loop will not be changed, so we inherit all attributes
82009467b48Spatrick // from the original loop, but remove the loop distribution metadata to
82109467b48Spatrick // avoid to distribute it again.
822*d415bd75Srobert MDNode *UnversionedLoopID = *makeFollowupLoopID(
823*d415bd75Srobert OrigLoopID,
824*d415bd75Srobert {LLVMLoopDistributeFollowupAll, LLVMLoopDistributeFollowupFallback},
825*d415bd75Srobert "llvm.loop.distribute.", true);
82609467b48Spatrick LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
82709467b48Spatrick }
82809467b48Spatrick
82909467b48Spatrick // Create identical copies of the original loop for each partition and hook
83009467b48Spatrick // them up sequentially.
83109467b48Spatrick Partitions.cloneLoops();
83209467b48Spatrick
83309467b48Spatrick // Now, we remove the instruction from each loop that don't belong to that
83409467b48Spatrick // partition.
83509467b48Spatrick Partitions.removeUnusedInsts();
83609467b48Spatrick LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
83709467b48Spatrick LLVM_DEBUG(Partitions.printBlocks());
83809467b48Spatrick
83909467b48Spatrick if (LDistVerify) {
84009467b48Spatrick LI->verify(*DT);
84109467b48Spatrick assert(DT->verify(DominatorTree::VerificationLevel::Fast));
84209467b48Spatrick }
84309467b48Spatrick
84409467b48Spatrick ++NumLoopsDistributed;
84509467b48Spatrick // Report the success.
84609467b48Spatrick ORE->emit([&]() {
84709467b48Spatrick return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
84809467b48Spatrick L->getHeader())
84909467b48Spatrick << "distributed loop";
85009467b48Spatrick });
85109467b48Spatrick return true;
85209467b48Spatrick }
85309467b48Spatrick
85409467b48Spatrick /// Provide diagnostics then \return with false.
fail(StringRef RemarkName,StringRef Message)85509467b48Spatrick bool fail(StringRef RemarkName, StringRef Message) {
85609467b48Spatrick LLVMContext &Ctx = F->getContext();
857*d415bd75Srobert bool Forced = isForced().value_or(false);
85809467b48Spatrick
85909467b48Spatrick LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
86009467b48Spatrick
86109467b48Spatrick // With Rpass-missed report that distribution failed.
86209467b48Spatrick ORE->emit([&]() {
86309467b48Spatrick return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
86409467b48Spatrick L->getStartLoc(), L->getHeader())
86509467b48Spatrick << "loop not distributed: use -Rpass-analysis=loop-distribute for "
86609467b48Spatrick "more "
86709467b48Spatrick "info";
86809467b48Spatrick });
86909467b48Spatrick
87009467b48Spatrick // With Rpass-analysis report why. This is on by default if distribution
87109467b48Spatrick // was requested explicitly.
87209467b48Spatrick ORE->emit(OptimizationRemarkAnalysis(
87309467b48Spatrick Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
87409467b48Spatrick RemarkName, L->getStartLoc(), L->getHeader())
87509467b48Spatrick << "loop not distributed: " << Message);
87609467b48Spatrick
87709467b48Spatrick // Also issue a warning if distribution was requested explicitly but it
87809467b48Spatrick // failed.
87909467b48Spatrick if (Forced)
88009467b48Spatrick Ctx.diagnose(DiagnosticInfoOptimizationFailure(
88109467b48Spatrick *F, L->getStartLoc(), "loop not distributed: failed "
88209467b48Spatrick "explicitly specified loop distribution"));
88309467b48Spatrick
88409467b48Spatrick return false;
88509467b48Spatrick }
88609467b48Spatrick
88709467b48Spatrick /// Return if distribution forced to be enabled/disabled for the loop.
88809467b48Spatrick ///
88909467b48Spatrick /// If the optional has a value, it indicates whether distribution was forced
89009467b48Spatrick /// to be enabled (true) or disabled (false). If the optional has no value
89109467b48Spatrick /// distribution was not forced either way.
isForced() const892*d415bd75Srobert const std::optional<bool> &isForced() const { return IsForced; }
89309467b48Spatrick
89409467b48Spatrick private:
89509467b48Spatrick /// Filter out checks between pointers from the same partition.
89609467b48Spatrick ///
89709467b48Spatrick /// \p PtrToPartition contains the partition number for pointers. Partition
89809467b48Spatrick /// number -1 means that the pointer is used in multiple partitions. In this
89909467b48Spatrick /// case we can't safely omit the check.
includeOnlyCrossPartitionChecks(const SmallVectorImpl<RuntimePointerCheck> & AllChecks,const SmallVectorImpl<int> & PtrToPartition,const RuntimePointerChecking * RtPtrChecking)900097a140dSpatrick SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
901097a140dSpatrick const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
90209467b48Spatrick const SmallVectorImpl<int> &PtrToPartition,
90309467b48Spatrick const RuntimePointerChecking *RtPtrChecking) {
904097a140dSpatrick SmallVector<RuntimePointerCheck, 4> Checks;
90509467b48Spatrick
90609467b48Spatrick copy_if(AllChecks, std::back_inserter(Checks),
907097a140dSpatrick [&](const RuntimePointerCheck &Check) {
90809467b48Spatrick for (unsigned PtrIdx1 : Check.first->Members)
90909467b48Spatrick for (unsigned PtrIdx2 : Check.second->Members)
91009467b48Spatrick // Only include this check if there is a pair of pointers
91109467b48Spatrick // that require checking and the pointers fall into
91209467b48Spatrick // separate partitions.
91309467b48Spatrick //
91409467b48Spatrick // (Note that we already know at this point that the two
91509467b48Spatrick // pointer groups need checking but it doesn't follow
91609467b48Spatrick // that each pair of pointers within the two groups need
91709467b48Spatrick // checking as well.
91809467b48Spatrick //
91909467b48Spatrick // In other words we don't want to include a check just
92009467b48Spatrick // because there is a pair of pointers between the two
92109467b48Spatrick // pointer groups that require checks and a different
92209467b48Spatrick // pair whose pointers fall into different partitions.)
92309467b48Spatrick if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
92409467b48Spatrick !RuntimePointerChecking::arePointersInSamePartition(
92509467b48Spatrick PtrToPartition, PtrIdx1, PtrIdx2))
92609467b48Spatrick return true;
92709467b48Spatrick return false;
92809467b48Spatrick });
92909467b48Spatrick
93009467b48Spatrick return Checks;
93109467b48Spatrick }
93209467b48Spatrick
93309467b48Spatrick /// Check whether the loop metadata is forcing distribution to be
93409467b48Spatrick /// enabled/disabled.
setForced()93509467b48Spatrick void setForced() {
936*d415bd75Srobert std::optional<const MDOperand *> Value =
93709467b48Spatrick findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
93809467b48Spatrick if (!Value)
93909467b48Spatrick return;
94009467b48Spatrick
94109467b48Spatrick const MDOperand *Op = *Value;
94209467b48Spatrick assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
94309467b48Spatrick IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
94409467b48Spatrick }
94509467b48Spatrick
94609467b48Spatrick Loop *L;
94709467b48Spatrick Function *F;
94809467b48Spatrick
94909467b48Spatrick // Analyses used.
95009467b48Spatrick LoopInfo *LI;
95109467b48Spatrick const LoopAccessInfo *LAI = nullptr;
95209467b48Spatrick DominatorTree *DT;
95309467b48Spatrick ScalarEvolution *SE;
954*d415bd75Srobert LoopAccessInfoManager &LAIs;
95509467b48Spatrick OptimizationRemarkEmitter *ORE;
95609467b48Spatrick
95709467b48Spatrick /// Indicates whether distribution is forced to be enabled/disabled for
95809467b48Spatrick /// the loop.
95909467b48Spatrick ///
96009467b48Spatrick /// If the optional has a value, it indicates whether distribution was forced
96109467b48Spatrick /// to be enabled (true) or disabled (false). If the optional has no value
96209467b48Spatrick /// distribution was not forced either way.
963*d415bd75Srobert std::optional<bool> IsForced;
96409467b48Spatrick };
96509467b48Spatrick
96609467b48Spatrick } // end anonymous namespace
96709467b48Spatrick
96809467b48Spatrick /// Shared implementation between new and old PMs.
runImpl(Function & F,LoopInfo * LI,DominatorTree * DT,ScalarEvolution * SE,OptimizationRemarkEmitter * ORE,LoopAccessInfoManager & LAIs)96909467b48Spatrick static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
97009467b48Spatrick ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
971*d415bd75Srobert LoopAccessInfoManager &LAIs) {
97209467b48Spatrick // Build up a worklist of inner-loops to vectorize. This is necessary as the
97309467b48Spatrick // act of distributing a loop creates new loops and can invalidate iterators
97409467b48Spatrick // across the loops.
97509467b48Spatrick SmallVector<Loop *, 8> Worklist;
97609467b48Spatrick
97709467b48Spatrick for (Loop *TopLevelLoop : *LI)
97809467b48Spatrick for (Loop *L : depth_first(TopLevelLoop))
97909467b48Spatrick // We only handle inner-most loops.
98073471bf0Spatrick if (L->isInnermost())
98109467b48Spatrick Worklist.push_back(L);
98209467b48Spatrick
98309467b48Spatrick // Now walk the identified inner loops.
98409467b48Spatrick bool Changed = false;
98509467b48Spatrick for (Loop *L : Worklist) {
986*d415bd75Srobert LoopDistributeForLoop LDL(L, &F, LI, DT, SE, LAIs, ORE);
98709467b48Spatrick
98809467b48Spatrick // If distribution was forced for the specific loop to be
98909467b48Spatrick // enabled/disabled, follow that. Otherwise use the global flag.
990*d415bd75Srobert if (LDL.isForced().value_or(EnableLoopDistribute))
991*d415bd75Srobert Changed |= LDL.processLoop();
99209467b48Spatrick }
99309467b48Spatrick
99409467b48Spatrick // Process each loop nest in the function.
99509467b48Spatrick return Changed;
99609467b48Spatrick }
99709467b48Spatrick
99809467b48Spatrick namespace {
99909467b48Spatrick
100009467b48Spatrick /// The pass class.
100109467b48Spatrick class LoopDistributeLegacy : public FunctionPass {
100209467b48Spatrick public:
100309467b48Spatrick static char ID;
100409467b48Spatrick
LoopDistributeLegacy()100509467b48Spatrick LoopDistributeLegacy() : FunctionPass(ID) {
100609467b48Spatrick // The default is set by the caller.
100709467b48Spatrick initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
100809467b48Spatrick }
100909467b48Spatrick
runOnFunction(Function & F)101009467b48Spatrick bool runOnFunction(Function &F) override {
101109467b48Spatrick if (skipFunction(F))
101209467b48Spatrick return false;
101309467b48Spatrick
101409467b48Spatrick auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
101509467b48Spatrick auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
101609467b48Spatrick auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
101709467b48Spatrick auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1018*d415bd75Srobert auto &LAIs = getAnalysis<LoopAccessLegacyAnalysis>().getLAIs();
101909467b48Spatrick
1020*d415bd75Srobert return runImpl(F, LI, DT, SE, ORE, LAIs);
102109467b48Spatrick }
102209467b48Spatrick
getAnalysisUsage(AnalysisUsage & AU) const102309467b48Spatrick void getAnalysisUsage(AnalysisUsage &AU) const override {
102409467b48Spatrick AU.addRequired<ScalarEvolutionWrapperPass>();
102509467b48Spatrick AU.addRequired<LoopInfoWrapperPass>();
102609467b48Spatrick AU.addPreserved<LoopInfoWrapperPass>();
102709467b48Spatrick AU.addRequired<LoopAccessLegacyAnalysis>();
102809467b48Spatrick AU.addRequired<DominatorTreeWrapperPass>();
102909467b48Spatrick AU.addPreserved<DominatorTreeWrapperPass>();
103009467b48Spatrick AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
103109467b48Spatrick AU.addPreserved<GlobalsAAWrapperPass>();
103209467b48Spatrick }
103309467b48Spatrick };
103409467b48Spatrick
103509467b48Spatrick } // end anonymous namespace
103609467b48Spatrick
run(Function & F,FunctionAnalysisManager & AM)103709467b48Spatrick PreservedAnalyses LoopDistributePass::run(Function &F,
103809467b48Spatrick FunctionAnalysisManager &AM) {
103909467b48Spatrick auto &LI = AM.getResult<LoopAnalysis>(F);
104009467b48Spatrick auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
104109467b48Spatrick auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
104209467b48Spatrick auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
104309467b48Spatrick
1044*d415bd75Srobert LoopAccessInfoManager &LAIs = AM.getResult<LoopAccessAnalysis>(F);
1045*d415bd75Srobert bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, LAIs);
104609467b48Spatrick if (!Changed)
104709467b48Spatrick return PreservedAnalyses::all();
104809467b48Spatrick PreservedAnalyses PA;
104909467b48Spatrick PA.preserve<LoopAnalysis>();
105009467b48Spatrick PA.preserve<DominatorTreeAnalysis>();
105109467b48Spatrick return PA;
105209467b48Spatrick }
105309467b48Spatrick
105409467b48Spatrick char LoopDistributeLegacy::ID;
105509467b48Spatrick
105609467b48Spatrick static const char ldist_name[] = "Loop Distribution";
105709467b48Spatrick
INITIALIZE_PASS_BEGIN(LoopDistributeLegacy,LDIST_NAME,ldist_name,false,false)105809467b48Spatrick INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
105909467b48Spatrick false)
106009467b48Spatrick INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
106109467b48Spatrick INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
106209467b48Spatrick INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
106309467b48Spatrick INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
106409467b48Spatrick INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
106509467b48Spatrick INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
106609467b48Spatrick
106709467b48Spatrick FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
1068